Exemple #1
0
/**
 * round trip test for the gzip streams: the file "configure" is compressed
 * into a memory buffer, the buffer is decompressed again and the result is
 * compared byte by byte with the contents of the file
 **/
void testGzip()
{
	libmaus2::aio::CheckedInputStream CIS("configure");
	uint64_t written = 0;
	std::ostringstream compressed;

	{
		libmaus2::lz::GzipOutputStream GZOS(compressed);

		// get() returns a negative value at the end of the file
		for ( int sym = CIS.get(); sym >= 0; sym = CIS.get() )
			GZOS.put(sym);

		// value returned by terminate is compared to the size of the compressed data below
		written = GZOS.terminate();
	}

	// rewind the input file for the comparison pass
	CIS.clear();
	CIS.seekg(0);

	assert ( written == compressed.str().size() );

	std::istringstream istr(compressed.str());
	libmaus2::lz::BufferedGzipStream BGS(istr);

	for ( int expected = CIS.get(); expected >= 0; expected = CIS.get() )
	{
		int const got = BGS.get();
		assert ( got == expected );
	}

	// the decompressed stream needs to end where the file ends
	assert ( BGS.get() < 0 );
}
Exemple #2
0
			static void concatenate(std::vector<std::string> const & infilenames, std::string const & outfilename, bool const removeinput = false)
			{
				uint64_t const n = ::libmaus2::gamma::GammaRLDecoder::getLength(infilenames);
				unsigned int const albits = infilenames.size() ? ::libmaus2::gamma::GammaRLDecoder::getAlBits(infilenames[0]) : 0;
				
				::libmaus2::aio::OutputStreamInstance COS(outfilename);
				::libmaus2::aio::SynchronousGenericOutput<uint64_t> SGO(COS,64);
				SGO.put(n);
				SGO.put(albits);
				SGO.flush();
				uint64_t const headerlen = 2*sizeof(uint64_t);
				
				std::vector < ::libmaus2::huffman::IndexEntry > index;
				uint64_t ioff = headerlen;
				
				for ( uint64_t i = 0; i < infilenames.size(); ++i )
				{
					uint64_t const indexpos = ::libmaus2::huffman::IndexLoaderBase::getIndexPos(infilenames[i]);
					uint64_t const datalen = indexpos-headerlen;
					
					// copy data
					::libmaus2::aio::InputStreamInstance CIS(infilenames[i]);
					CIS.seekg(headerlen);
					::libmaus2::util::GetFileSize::copy(CIS,COS,datalen);
					
					// add entries to index
					::libmaus2::huffman::IndexLoaderSequential indexdata(infilenames[i]);
					::libmaus2::huffman::IndexEntry ij = indexdata.getNext();
					
					// ::libmaus2::huffman::IndexDecoderData indexdata(infilenames[i]);
					for ( uint64_t j = 0; j < indexdata.numentries; ++j )
					{
						::libmaus2::huffman::IndexEntry ij1 = indexdata.getNext();
						/*
						::libmaus2::huffman::IndexEntry const ij  = indexdata.readEntry(j);
						::libmaus2::huffman::IndexEntry const ij1 = indexdata.readEntry(j+1);						
						*/
						index.push_back(::libmaus2::huffman::IndexEntry((ij.pos - headerlen) + ioff, ij1.kcnt - ij.kcnt, ij1.vcnt - ij.vcnt));
						
						ij = ij1;
					}
					
					// update position pointer
					ioff += datalen;
					
					if ( removeinput )
						libmaus2::aio::FileRemoval::removeFile(infilenames[i]);
				}

				// write index
				::libmaus2::aio::SynchronousGenericOutput<uint8_t> SGO8(COS,64*1024);
				::libmaus2::aio::SynchronousGenericOutput<uint8_t>::iterator_type it(SGO8);
				::libmaus2::bitio::FastWriteBitWriterStream8Std FWBWS(it);
				writeIndex(index,FWBWS,ioff);

				FWBWS.flush();
				SGO8.flush();
				COS.flush();
			}
Exemple #3
0
			/**
			 * construct an object from the file fn and the base layer B
			 *
			 * @param B base layer object passed to the constructor
			 * @param fn name of the file the object is read from
			 * @return constructed object encapsulated in a unique pointer
			 **/
			static unique_ptr_type load(base_layer_type const & B, std::string const & fn)
			{
				libmaus::aio::CheckedInputStream istr(fn);
				unique_ptr_type uptr(new this_type(istr,B));
				return UNIQUE_PTR_MOVE(uptr);
			}
Exemple #4
0
/**
 * compute the histogram of the byte values stored in a file. The file is
 * split into one contiguous range per thread (a single range if the code
 * is compiled without OpenMP support), each range is read in blocks and
 * the per range histograms are added up under a lock.
 *
 * @param inputfile name of the input file
 * @return array of 256 counters, one per byte value
 **/
::libmaus::autoarray::AutoArray<uint64_t> computeCharHist(std::string const & inputfile)
{
	uint64_t const n = ::libmaus::util::GetFileSize::getFileSize(inputfile);

	#if defined(_OPENMP)
	uint64_t const numthreads = omp_get_max_threads();
	#else
	uint64_t const numthreads = 1;
	#endif

	// number of bytes per range, rounded up
	uint64_t const packsize = (n + numthreads-1)/numthreads;

	::libmaus::parallel::OMPLock lock;
	::libmaus::autoarray::AutoArray<uint64_t> ghist(256);
	#if defined(_OPENMP)
	#pragma omp parallel for
	#endif
	for ( int64_t t = 0; t < static_cast<int64_t>(numthreads); ++t )
	{
		uint64_t const low  = std::min(n,t*packsize);
		uint64_t const high = std::min(n,low+packsize);

		// nothing to do for an empty range
		if ( high == low )
			continue;

		::libmaus::autoarray::AutoArray<uint64_t> lhist(ghist.size());
		::libmaus::aio::CheckedInputStream CIS(inputfile);
		CIS.seekg(low);
		uint64_t const blocksize = 8192;
		::libmaus::autoarray::AutoArray<uint8_t> B(blocksize);

		// read the range block by block, the last block may be shorter
		uint64_t pos = low;
		while ( pos < high )
		{
			uint64_t const len = std::min(blocksize,high-pos);

			CIS.read ( reinterpret_cast<char *>(B.begin()), len );
			assert ( CIS.gcount() == static_cast<int64_t>(len) );

			for ( uint8_t const * p = B.begin(); p != B.begin()+len; ++p )
				++lhist[*p];

			pos += len;
		}

		// add local histogram to the global one
		lock.lock();
		for ( uint64_t sym = 0; sym < lhist.size(); ++sym )
			ghist[sym] += lhist[sym];
		lock.unlock();
	}

	return ghist;
}
Exemple #5
0
/**
 * construct an octet string from a part of a file. The offset is clamped
 * to the size of the file and the length is clamped to the number of bytes
 * available behind the offset.
 *
 * @param filename name of the input file
 * @param offset position of the first byte read
 * @param blength maximum number of bytes read
 **/
libmaus2::util::OctetString::OctetString(
	std::string const & filename,
	uint64_t offset,
	uint64_t blength)
{
	::libmaus2::aio::CheckedInputStream istr(filename);
	uint64_t const filesize = ::libmaus2::util::GetFileSize::getFileSize(istr);

	// clamp the requested interval to the file
	if ( filesize < offset )
		offset = filesize;
	uint64_t const avail = filesize-offset;
	if ( avail < blength )
		blength = avail;

	istr.seekg(offset);
	A = ::libmaus2::autoarray::AutoArray<uint8_t>(blength,false);
	char * const target = reinterpret_cast<char *>(A.begin());
	istr.read(target,blength);
}
			void mergeToDense(std::string const & outputfilename, uint64_t const n)
			{
				std::string const tmpfilename = tmpgen.getFileName();				
				libmaus::util::TempFileRemovalContainer::addTempFile(tmpfilename);

				if ( merge(tmpfilename) )
				{
					libmaus::aio::CheckedInputStream CIS(tmpfilename);
					libmaus::gamma::SparseGammaGapDecoder SGGD(CIS);
					libmaus::gamma::SparseGammaGapDecoder::iterator it = SGGD.begin();
					
					libmaus::gamma::GammaGapEncoder GGE(outputfilename);
					GGE.encode(it,n);

					remove(tmpfilename.c_str());
				}
			}
Exemple #7
0
/**
 * test for merging sparse gamma gap files: 25 files with three entries
 * each are written and registered with a file set, the set is merged
 * into a single file and the first 60 decoded values are compared with
 * a reference map kept in memory
 **/
void testsparsegammamerge()
{
	libmaus::util::TempFileNameGenerator tmpgen("tmp",3);
	libmaus::gamma::SparseGammaGapFileSet SGGF(tmpgen);
	std::map<uint64_t,uint64_t> expected;

	for ( uint64_t i = 0; i < 25;  ++i )
	{
		std::string const fn = tmpgen.getFileName();
		libmaus::aio::CheckedOutputStream COS(fn);
		libmaus::gamma::SparseGammaGapEncoder SGE(COS);

		// file i stores value i+1 for the keys 2i, 2i+2 and 2i+4
		for ( uint64_t k = 0; k < 3; ++k )
		{
			uint64_t const key = 2*i + 2*k;
			SGE.encode(key,i+1);
			expected[key] += (i+1);
		}
		SGE.term();

		SGGF.addFile(fn);
	}

	std::string const mergedfn = tmpgen.getFileName();
	SGGF.merge(mergedfn);

	libmaus::aio::CheckedInputStream CIS(mergedfn);
	libmaus::gamma::SparseGammaGapDecoder SGGD(CIS);
	for ( uint64_t i = 0; i < 60; ++i )
	{
		uint64_t const dv = SGGD.decode();

		// keys missing in the reference map are expected to decode to zero
		std::map<uint64_t,uint64_t>::const_iterator const it = expected.find(i);
		uint64_t const ref = (it != expected.end()) ? it->second : 0;

		std::cerr << dv << "(" << ref << ")";
		assert ( ref == dv );
		std::cerr << ";";
	}
	std::cerr << std::endl;

	remove(mergedfn.c_str());
}
Exemple #8
0
			/**
			 * load an array from the file named filename by opening the file
			 * and forwarding to the stream based loadArray function
			 *
			 * @param filename name of the input file
			 * @return loaded array
			 **/
			static ::libmaus2::autoarray::AutoArray<uint64_t> loadArray(std::string const & filename)
			{
				::libmaus2::aio::InputStreamInstance istr(filename);
				return loadArray(istr);
			}
/**
 * Read a BAM file from standard input and write a BAM file to standard
 * output in which, for mapped alignments, the cigar operation M is replaced
 * by runs of = (match) and X (mismatch) and the matching bases of the query
 * sequence are replaced by the symbol '='. The reference sequences are taken
 * from the FastA files given as the remaining command line arguments.
 * Alignments for which checkCigarValid returns false are not written.
 *
 * @param argc number of command line arguments
 * @param argv command line arguments
 * @return EXIT_FAILURE if an exception was caught
 **/
int main(int argc, char * argv[])
{
	try
	{
		::libmaus::util::ArgInfo const arginfo(argc,argv);
		::libmaus::util::TempFileRemovalContainer::setup();
		
		// FastA input files are the non key=value arguments
		::std::vector<std::string> const & inputfilenames = arginfo.restargs;
		char const * fasuffixes[] = { ".fa", ".fasta", 0 };
		// default temp file name derived from the input file names, extended by underscores until no such file exists
		std::string defoutname = libmaus::util::OutputFileNameTools::endClipLcp(inputfilenames,&fasuffixes[0]) + ".fa";
		while ( ::libmaus::util::GetFileSize::fileExists(defoutname) )
			defoutname += "_";
		std::string const fatempfilename = arginfo.getValue<std::string>("fatempfilename",defoutname);
		::libmaus::util::TempFileRemovalContainer::addTempFile(fatempfilename);
		
		// std::cerr << "output file name " << defoutname << std::endl;
		
		// rewrite all FastA input into the temp file, info holds one record per sequence
		::std::vector< ::libmaus::fastx::FastAReader::RewriteInfo > const info = ::libmaus::fastx::FastAReader::rewriteFiles(inputfilenames,fatempfilename);
		
		// fachr maps the id prefix of a sequence to its index in info
		std::map < std::string, uint64_t > fachr;
		// fapref is used below as the file offset of the sequence data of entry i in the temp file
		// NOTE(review): presumably prefixSums turns the entry lengths into entry start offsets - confirm
		::libmaus::autoarray::AutoArray < uint64_t > fapref(info.size()+1);
		for ( uint64_t i = 0; i < info.size(); ++i )
		{
			// std::cerr << info[i].valid << "\t" << info[i].idlen << "\t" << info[i].seqlen << "\t" << info[i].getIdPrefix() << std::endl;
			fachr[info[i].getIdPrefix()] = i;
			fapref [ i ] = info[i].getEntryLength() ;
		}
		fapref.prefixSums();
		// skip the id line of each entry
		for ( uint64_t i = 0; i < info.size(); ++i )
			fapref [ i ] += info[i].idlen + 2; // > + newline

		// BAM input is read from standard input
		::libmaus::bambam::BamDecoder decoder(std::cin);
		::libmaus::bambam::BamHeader const & bamheader = decoder.bamheader;
		// std::vector< ::libmaus::bambam::Chromosome > chromosomes

		// table mapping each byte value to its upper case version
		::libmaus::autoarray::AutoArray<uint8_t> uptab(256,false);
		for ( uint64_t j = 0; j < uptab.size(); ++j )
			uptab[j] = toupper(j);
		
		// load one reference text per sequence in the BAM header
		::libmaus::autoarray::AutoArray < ::libmaus::autoarray::AutoArray<uint8_t>::unique_ptr_type > text(bamheader.chromosomes.size());
		for ( uint64_t i = 0; i < bamheader.chromosomes.size(); ++i )
		{
			std::string const bamchrname = bamheader.chromosomes[i].name;
			// each sequence named in the BAM header needs to exist in the FastA input
			if ( fachr.find(bamchrname) == fachr.end() )
			{
				::libmaus::exception::LibMausException se;
				se.getStream() << "Unable to find reference sequence " << bamchrname << " in fa file." << std::endl;
				se.finish();
				throw se;
			}
			uint64_t const faid = fachr.find(bamchrname)->second;
			// sequence lengths in BAM header and FastA input need to agree
			if ( bamheader.chromosomes[i].len != info[faid].seqlen )
			{
				::libmaus::exception::LibMausException se;
				se.getStream() << "Reference sequence " << bamchrname << " has len " << bamheader.chromosomes[i].len << " in bam file but " << info[faid].seqlen << " in fa file." << std::endl;
				se.finish();
				throw se;
			}
			
			// progress messages are only printed if there are fewer than 100 sequences
			if ( bamheader.chromosomes.size() < 100 )
				std::cerr << "Loading sequence " << bamchrname << " of length " << info[faid].seqlen << std::endl;
			text [ i ] = UNIQUE_PTR_MOVE(::libmaus::autoarray::AutoArray<uint8_t>::unique_ptr_type(new ::libmaus::autoarray::AutoArray<uint8_t>(info[faid].seqlen,false)));
			// read seqlen bytes starting at the computed offset of the sequence data
			::libmaus::aio::CheckedInputStream CIS(fatempfilename);
			CIS.seekg(fapref[faid]);
			CIS.read(reinterpret_cast<char *>(text[i]->begin()),info[faid].seqlen);
			// sanity check, next symbol in file should be a newline
			int c;
			c = CIS.get();
			assert ( c == '\n' );
			
			// convert to upper case
			for ( uint8_t * pa = text[i]->begin(); pa != text[i]->end(); ++pa )
				*pa = uptab[*pa];
		}
		
		// check that the loaded texts have the lengths stated in the BAM header
		for ( uint64_t i = 0; i < bamheader.chromosomes.size(); ++i )
		{
			assert ( text[i]->size() == bamheader.chromosomes[i].len );
		}
		
		// number of alignments read so far
		uint64_t decoded = 0;
		
		// BAM output is written to standard output using the header of the input
		::libmaus::bambam::BamWriter BW(std::cout,bamheader);
		
		while ( decoder.readAlignment() )
		{
			++decoded;
			
			// print progress every 2^20 alignments
			if ( decoded % (1024*1024) == 0 )
			{
				std::cerr << "[V] " << decoded << std::endl;
			}
			
			::libmaus::bambam::BamAlignment & alignment = decoder.alignment;

			bool const cigok = checkCigarValid(alignment,bamheader,text);
			
			// if cigar is ok then keep alignment
			if ( cigok )
			{
				// unmapped alignments are written without modification
				if ( !alignment.isUnmap() )
				{
					// current position on the query sequence
					uint64_t seqpos = 0;
					// current position on the reference sequence
					uint64_t refpos = alignment.getPos();
					std::string const read = alignment.getRead();
					// copy of the query sequence in which matching bases are replaced by '='
					std::string modseq = read;
					::libmaus::autoarray::AutoArray<uint8_t> const & ctext = *(text[alignment.getRefID()]);
					
					// the new cigar string is assembled as text
					std::ostringstream newcigarstream;

					for ( uint64_t i = 0; i < alignment.getNCigar(); ++i )
					{
						char const cop = alignment.getCigarFieldOpAsChar(i);
						int64_t const clen = alignment.getCigarFieldLength(i);
						
						// NOTE(review): there is no default case, an operation not listed below is omitted from the new cigar string
						switch ( cop )
						{
							// match/mismatch, increment both
							case 'M':
							{
								// number of positions of this operation handled so far
								int64_t low = 0;
								
								// split the M operation into alternating runs of matches and mismatches
								while ( low != clen )
								{
									int64_t high = low;
									
									// extend run of matches
									while ( high != clen && ctext[refpos] == read[seqpos] )
									{
										modseq[seqpos] = '=';
										++refpos, ++seqpos, ++ high;
									}
									if ( high != low )
										newcigarstream << high-low << "=";
										
									low = high;

									// extend run of mismatches
									while ( high != clen && ctext[refpos] != read[seqpos] )
										++refpos, ++seqpos, ++ high;
									if ( high != low )
										newcigarstream << high-low << "X";
										
									low = high;
								}						
								
								break;
							}
							// match, advance both positions and mark the bases as matching
							case '=':
							{
								refpos += clen;
								for ( int64_t j = 0; j < clen; ++j, ++seqpos )
									modseq[seqpos] = '=';
								newcigarstream << clen << cop; 
								break;
							}
							// mismatch, advance both positions
							case 'X':
							{
								refpos += clen;
								seqpos += clen;
								newcigarstream << clen << cop; 
								break;
							}
							// padding and insertion, advance the query position only
							case 'P':
							case 'I':
							{
								seqpos += clen;
								newcigarstream << clen << cop; 
								break;
							}
							// skipped region and deletion, advance the reference position only
							case 'N':
							case 'D':
							{
								refpos += clen;
								newcigarstream << clen << cop; 
								break;
							}
							// soft clipping, advance the query position only
							case 'S':
							{
								seqpos += clen;
								newcigarstream << clen << cop; 
								break;
							}
							// hard clipping, neither position changes
							case 'H':
							{
								newcigarstream << clen << cop; 
								break;
							}
						}
					}
					
					// store the modified cigar string and query sequence in the alignment
					alignment.replaceCigarString(newcigarstream.str());
					alignment.replaceSequence(modseq);
				}

				alignment.serialise(BW.bgzfos);
			}			
		}
	}
	catch(std::exception const & ex)
	{
		// print the error message and signal failure
		std::cerr << ex.what() << std::endl;
		return EXIT_FAILURE;
	}
}
Exemple #10
0
void testlz4()
{
	std::ostringstream ostr;
	
	{
		libmaus2::lz::Lz4CompressStream compressor(ostr,16*1024);
		libmaus2::aio::CheckedInputStream CIS("configure");
		int c;
		while ( (c=CIS.get()) > 0 )
			compressor.put(c);
		compressor.writeIndex();
	}

	libmaus2::autoarray::AutoArray<char> const C = libmaus2::autoarray::AutoArray<char>::readFile("configure");

	std::istringstream istr(ostr.str());
	libmaus2::lz::Lz4Decoder dec(istr);
	
	{

		for ( uint64_t i = 0; i < C.size(); i += 100 )
		{
			if ( i % 16 == 0 )
				std::cerr << "i=" <<i << std::endl;
		
			int c;
			dec.clear();
			dec.seekg(i);
			uint64_t j = i;
			while ( (c=dec.get()) > 0 )
			{
				assert ( c == static_cast<uint8_t>(C[j++]) );
			}
		}
			
		uint64_t i = C.size()-1;
		int c;
		dec.clear();
		dec.seekg(i);
		uint64_t j = i;
		while ( (c=dec.get()) > 0 )
		{
			assert ( c == static_cast<uint8_t>(C[j++]) );
		}
	}
	
	libmaus2::random::Random::setup(time(0));
	
	dec.clear();
	for ( uint64_t j = 0; j < 16384; ++j )
	{
		uint64_t const r = 10;
		uint64_t const p = libmaus2::random::Random::rand64() % ( C.size()-r );
		
		dec.seekg(p);
		for ( uint64_t i = 0; i < r; ++i )
		{
			assert ( dec.get() == static_cast<uint8_t>(C[p+i]) );
		}
	}
}
Exemple #11
0
/**
 * test for the compact array file format: an array of random 3 bit
 * values is kept in memory and written to the file "tmpfile" at the same
 * time, then the file is read back sequentially and after seeking to
 * various positions and the values read are compared with the array
 * in memory
 **/
void testcompact()
{
	std::string const fn("tmpfile");

	::libmaus::util::TempFileRemovalContainer::setup();
	::libmaus::util::TempFileRemovalContainer::addTempFile(fn);
	
	uint64_t n = 1024*1024;
	unsigned int const b = 3;
	::libmaus::bitio::CompactArray CA(n,b);
	::libmaus::bitio::CompactArrayWriter CAW(fn,n,b);
	srand(time(0));

	// fill array and file with the same random b bit values
	for ( uint64_t pos = 0; pos < n; ++pos )
	{
		CA.set(pos,rand() & ((1ull<<b)-1));
		CAW.put(CA.get(pos));
	}
	CAW.flush();
	
	// the file needs to be readable and not empty
	::libmaus::aio::CheckedInputStream CIS(fn);
	std::cerr << "compact file size is " << ::libmaus::util::GetFileSize::getFileSize(CIS) << std::endl;
	assert ( static_cast< ::std::streampos > (CIS.tellg()) == static_cast< ::std::streampos >(0) );
	assert ( CIS.get() >= 0 );
	
	::libmaus::bitio::CompactDecoderWrapper W(fn,4096);
	
	// determine the length of the decoded stream by seeking to its end
	W.seekg(0,std::ios::end);
	int64_t const fs = W.tellg();
	W.seekg(0,std::ios::beg);
	W.clear();
	
	assert ( fs == static_cast<int64_t>(n) );
	
	std::cerr << "n=" << n << " fs=" << fs << std::endl;
	
	// sequential pass over the complete stream
	{
		uint64_t pos = 0;
		while ( pos < n )
		{
			assert ( W.tellg() == static_cast< ::std::streampos >(pos) );
			int const v = W.get();
			assert ( v == static_cast<int>(CA[pos]) );
			++pos;
		}
	}
	
	// seek to increasing positions with random steps and read to the end each time
	uint64_t start = 0;
	while ( start < n )
	{
		W.clear();
		W.seekg(start);
		
		std::cerr << "seek to " << W.tellg() << std::endl;
		
		for ( uint64_t pos = start; pos < n; ++pos )
		{
			assert ( W.tellg() == static_cast< ::std::streampos >(pos) );
			int const v = W.get();
			assert ( v == static_cast<int>(CA[pos]) );
		}
		
		// same check for the position mirrored at the end of the stream
		uint64_t const mirrored = n-start;
		W.clear();
		W.seekg(mirrored);

		for ( uint64_t pos = mirrored; pos < n; ++pos )
		{
			assert ( W.tellg() == static_cast< ::std::streampos >(pos) );
			int const v = W.get();
			assert ( v == static_cast<int>(CA[pos]) );
		}

		start += (rand() % 256);
	}
}
Exemple #12
0
/**
 * benchmark comparing plain BAM decoding with collating BAM decoding:
 * each file given on the command line is decoded 10 times with either
 * method and the ratio of the average alignment rates is printed
 *
 * @param argc number of command line arguments
 * @param argv command line arguments
 * @return EXIT_FAILURE if an exception was caught
 **/
int main(int argc, char * argv[])
{
	try
	{
		libmaus2::util::ArgInfo const arginfo(argc,argv);
		libmaus2::timing::RealTimeClock rtc;
		uint64_t const runs = 10;
		std::pair <libmaus2::bambam::BamAlignment const *, libmaus2::bambam::BamAlignment const *> P;
		
		for ( uint64_t fileid = 0; fileid < arginfo.restargs.size(); ++fileid )
		{
			std::string const fn = arginfo.restargs[fileid];
			// sum of the rates, turned into averages below
			double srate = 0, drate = 0;
	
			// plain sequential decoding
			for ( uint64_t run = 0; run < runs; ++run )
			{
				rtc.start();
				libmaus2::bambam::BamDecoder bamdec(fn);
				uint64_t cnt = 0;
				while ( bamdec.readAlignment() )
					++cnt;
				double const lela = rtc.getElapsedSeconds();
				
				std::cerr << "[S] " << "cnt=" << cnt << " ela=" << lela 
					<< " rate=" << cnt/lela << std::endl;
					
				srate += cnt/lela;
			}

			// collating decoding, both ends of a pair are counted
			for ( uint64_t run = 0; run < runs; ++run )
			{
				rtc.start();
				libmaus2::aio::InputStreamInstance CIS(fn);
				libmaus2::bambam::BamCircularHashCollatingBamDecoder bamdec(CIS,"tmpfile");
				uint64_t cnt = 0;
				while ( bamdec.tryPair(P) )
				{
					cnt += P.first ? 1 : 0;
					cnt += P.second ? 1 : 0;
				}
				// removal of the temporary file is part of the measured time
				libmaus2::aio::FileRemoval::removeFile("tmpfile");
				double const lela = rtc.getElapsedSeconds();
				
				std::cerr << "[D] " << "cnt=" << cnt << " ela=" << lela 
					<< " rate=" << cnt/lela << std::endl;
					
				drate += cnt/lela;
			}
			
			// ratio of the average rates
			srate /= runs;
			drate /= runs;
			std::cerr << "[Q] " << srate/drate << std::endl;
		}
	}
	catch(std::exception const & ex)
	{
		std::cerr << ex.what() << std::endl;
		return EXIT_FAILURE;
	}
}
			/**
			 * load an object from the file named filename by opening the
			 * file and forwarding to the stream based load function
			 *
			 * @param filename name of the input file
			 * @return loaded object encapsulated in a unique pointer
			 **/
			static unique_ptr_type loadFile(std::string const & filename)
			{
				libmaus::aio::CheckedInputStream istr(filename);
				unique_ptr_type uptr(load(istr));
				return UNIQUE_PTR_MOVE(uptr);
			}
Exemple #14
0
			/**
			 * load a serialised vector of FileFragments from file named filename
			 *
			 * @param filename name of file containing a serialised file fragment vector
			 * @return deserialised vector
			 **/
			static std::vector<FileFragment> loadVector(std::string const & filename)
			{
				libmaus2::aio::CheckedInputStream CIS(filename);
			        std::vector<FileFragment> V = deserialiseVector(CIS);
			        return V;
			}
Exemple #15
0
/**
 * Read a BAM file from standard input, reset each alignment using
 * resetAlignment and write the alignments kept to standard output as a
 * BAM file with a modified header. The header text is either the input
 * header without its SQ lines or the contents of the file given by the
 * resetheadertext argument; a PG line for bamreset is added in both cases.
 * Optionally an md5 checksum and an index are computed for the output.
 *
 * @param arginfo parsed command line arguments
 * @return EXIT_SUCCESS
 **/
int bamreset(::libmaus2::util::ArgInfo const & arginfo)
{
	// binary data is neither read from nor written to a terminal
	if ( isatty(STDIN_FILENO) )
	{
		::libmaus2::exception::LibMausException se;
		se.getStream() << "Refusing to read binary data from terminal, please redirect standard input to pipe or file." << std::endl;
		se.finish();
		throw se;
	}

	if ( isatty(STDOUT_FILENO) )
	{
		::libmaus2::exception::LibMausException se;
		se.getStream() << "Refusing write binary data to terminal, please redirect standard output to pipe or file." << std::endl;
		se.finish();
		throw se;
	}
	
	// output compression level, checked by checkCompressionLevel
	int const level = libmaus2::bambam::BamBlockWriterBaseFactory::checkCompressionLevel(arginfo.getValue<int>("level",getDefaultLevel()));
	// print progress information if not zero
	int const verbose = arginfo.getValue<int>("verbose",getDefaultVerbose());
	// set sort order in the output header to unknown if not zero
	int const resetsortorder = arginfo.getValue<int>("resetsortorder",getDefaultResetSortOrder());
	
	::libmaus2::bambam::BamDecoder dec(std::cin,false);
	::libmaus2::bambam::BamHeader const & header = dec.getHeader();

	// text of the output header, starts as a copy of the input header text
	std::string headertext = header.text;

	// no replacement header file given
	if ( ! arginfo.hasArg("resetheadertext") )
	{
		// remove SQ lines
		std::vector<libmaus2::bambam::HeaderLine> allheaderlines = libmaus2::bambam::HeaderLine::extractLines(headertext);

		std::ostringstream upheadstr;
		for ( uint64_t i = 0; i < allheaderlines.size(); ++i )
			if ( allheaderlines[i].type != "SQ" )
				upheadstr << allheaderlines[i].line << std::endl;

		headertext = upheadstr.str();
	}
	// replace header given in file
	else
	{
		// the complete file is read and used as header text
		std::string const headerfilename = arginfo.getUnparsedValue("resetheadertext","");
		uint64_t const headerlen = libmaus2::util::GetFileSize::getFileSize(headerfilename);
		libmaus2::aio::CheckedInputStream CIS(headerfilename);
		libmaus2::autoarray::AutoArray<char> ctext(headerlen,false);
		CIS.read(ctext.begin(),headerlen);
		headertext = std::string(ctext.begin(),ctext.end());		
	}

	// add PG line to header
	headertext = libmaus2::bambam::ProgramHeaderLineSet::addProgramLine(
		headertext,
		"bamreset", // ID
		"bamreset", // PN
		arginfo.commandline, // CL
		::libmaus2::bambam::ProgramHeaderLineSet(headertext).getLastIdInChain(), // PP
		std::string(PACKAGE_VERSION) // VN			
	);
	
	// construct new header
	libmaus2::bambam::BamHeader uphead(headertext);
	if ( resetsortorder )
		uphead.changeSortOrder("unknown");

	/*
	 * start index/md5 callbacks
	 */
	std::string const tmpfilenamebase = arginfo.getValue<std::string>("tmpfile",arginfo.getDefaultTmpFileName());
	// temporary file passed to the index callback
	std::string const tmpfileindex = tmpfilenamebase + "_index";
	::libmaus2::util::TempFileRemovalContainer::addTempFile(tmpfileindex);
	// flags passed to resetAlignment as excludeflags
	uint32_t const excludeflags = libmaus2::bambam::BamFlagBase::stringToFlags(
		arginfo.getValue<std::string>("exclude",getDefaultExcludeFlags()));

	// names of the md5 and index output files, empty if the file is not requested
	std::string md5filename;
	std::string indexfilename;

	// callbacks passed to the writer, the objects are owned by Pmd5cb and Pindex
	std::vector< ::libmaus2::lz::BgzfDeflateOutputCallback * > cbs;
	::libmaus2::lz::BgzfDeflateOutputCallbackMD5::unique_ptr_type Pmd5cb;
	if ( arginfo.getValue<unsigned int>("md5",getDefaultMD5()) )
	{
		// a checksum is only computed if a non empty file name is given
		if ( arginfo.hasArg("md5filename") &&  arginfo.getUnparsedValue("md5filename","") != "" )
			md5filename = arginfo.getUnparsedValue("md5filename","");
		else
			std::cerr << "[V] no filename for md5 given, not creating hash" << std::endl;

		if ( md5filename.size() )
		{
			::libmaus2::lz::BgzfDeflateOutputCallbackMD5::unique_ptr_type Tmd5cb(new ::libmaus2::lz::BgzfDeflateOutputCallbackMD5);
			Pmd5cb = UNIQUE_PTR_MOVE(Tmd5cb);
			cbs.push_back(Pmd5cb.get());
		}
	}
	libmaus2::bambam::BgzfDeflateOutputCallbackBamIndex::unique_ptr_type Pindex;
	if ( arginfo.getValue<unsigned int>("index",getDefaultIndex()) )
	{
		// an index is only computed if a non empty file name is given
		if ( arginfo.hasArg("indexfilename") &&  arginfo.getUnparsedValue("indexfilename","") != "" )
			indexfilename = arginfo.getUnparsedValue("indexfilename","");
		else
			std::cerr << "[V] no filename for index given, not creating index" << std::endl;

		if ( indexfilename.size() )
		{
			libmaus2::bambam::BgzfDeflateOutputCallbackBamIndex::unique_ptr_type Tindex(new libmaus2::bambam::BgzfDeflateOutputCallbackBamIndex(tmpfileindex));
			Pindex = UNIQUE_PTR_MOVE(Tindex);
			cbs.push_back(Pindex.get());
		}
	}
	// the writer gets a null pointer if there are no callbacks
	std::vector< ::libmaus2::lz::BgzfDeflateOutputCallback * > * Pcbs = 0;
	if ( cbs.size() )
		Pcbs = &cbs;
	/*
	 * end md5/index callbacks
	 */

	// BAM output is written to standard output
	::libmaus2::bambam::BamWriter::unique_ptr_type writer(new ::libmaus2::bambam::BamWriter(std::cout,uphead,level,Pcbs));
 	libmaus2::timing::RealTimeClock rtc; rtc.start();
 	
	libmaus2::bambam::BamAlignment & algn = dec.getAlignment();
	// number of alignments processed, only counted if verbose is set
	uint64_t c = 0;

	bool const resetaux = arginfo.getValue<int>("resetaux",getDefaultResetAux());
	// filter passed to resetAlignment, constructed by parseAuxFilterList from the arguments
	libmaus2::bambam::BamAuxFilterVector::unique_ptr_type const prgfilter(libmaus2::bambam::BamAuxFilterVector::parseAuxFilterList(arginfo));
	libmaus2::bambam::BamAuxFilterVector const * rgfilter = prgfilter.get();

	while ( dec.readAlignment() )
	{
		// alignments for which resetAlignment returns false are dropped
		bool const keep = resetAlignment(algn,resetaux /* reset aux */,excludeflags,rgfilter);
		
		if ( keep )
			algn.serialise(writer->getStream());

		// print progress every 2^20 alignments
		if ( verbose && (++c & (1024*1024-1)) == 0 )
 			std::cerr << "[V] " << c/(1024*1024) << " " << (c / rtc.getElapsedSeconds()) << std::endl;
	}
	
	// destroy the writer before the callback results are stored
	writer.reset();

	if ( Pmd5cb )
	{
		Pmd5cb->saveDigestAsFile(md5filename);
	}
	if ( Pindex )
	{
		Pindex->flush(std::string(indexfilename));
	}

	return EXIT_SUCCESS;
}
Exemple #16
0
/**
 * Convert a set of (optionally gzip compressed) FastA files to a compact
 * file storing 2 bits per symbol. Symbols other than A,C,G,T (in either
 * case) are replaced by random bases and the replaced intervals are
 * recorded in a meta data file named like the output file with the suffix
 * .meta appended. If the rc argument is set then the reverse complement
 * of each sequence is stored after the sequence.
 *
 * Layout of the meta data file as written below: number of sequences,
 * then for each sequence its length, the number of replaced intervals
 * and the bounds (low,high) of each replaced interval.
 *
 * @param arginfo parsed command line arguments
 * @return EXIT_SUCCESS
 **/
int fagzToCompact4(libmaus2::util::ArgInfo const & arginfo)
{
	// store reverse complements (default yes)
	bool const rc = arginfo.getValue<unsigned int>("rc",1);
	// input is gzip compressed (default yes)
	bool const gz = arginfo.getValue<unsigned int>("gz",1);

	std::vector<std::string> inputfilenames;
	inputfilenames = arginfo.restargs;

	// additional input file names can be given in a file, one name per line
	if ( arginfo.hasArg("inputfilenames") )
	{
		std::string const inf = arginfo.getUnparsedValue("inputfilenames",std::string());
		libmaus2::aio::InputStream::unique_ptr_type Pinf(libmaus2::aio::InputStreamFactoryContainer::constructUnique(inf));
		while ( *Pinf )
		{
			std::string line;
			std::getline(*Pinf,line);
			// empty lines are skipped
			if ( line.size() )
				inputfilenames.push_back(line);
		}
	}

	// default output name: result of lcp on the input file names without the suffixes .gz, .fasta and .fa
	std::string const inlcp = libmaus2::util::OutputFileNameTools::lcp(inputfilenames);
	std::string defout = inlcp;
	defout = libmaus2::util::OutputFileNameTools::clipOff(defout,".gz");
	defout = libmaus2::util::OutputFileNameTools::clipOff(defout,".fasta");
	defout = libmaus2::util::OutputFileNameTools::clipOff(defout,".fa");

	std::string const outputfilename = arginfo.getUnparsedValue("outputfilename",defout + ".compact");
	std::string const metaoutputfilename = outputfilename + ".meta";
	int const verbose = arginfo.getValue<int>("verbose",1);
	// NOTE(review): B is not used anywhere in this function
	libmaus2::autoarray::AutoArray<char> B(8*1024,false);
	libmaus2::bitio::CompactArrayWriterFile compactout(outputfilename,2 /* bits per symbol */);

	if ( ! rc )
		std::cerr << "[V] not storing reverse complements" << std::endl;

	// forward mapping table
	libmaus2::autoarray::AutoArray<uint8_t> ftable(256,false);
	// rc mapping for mapped symbols
	libmaus2::autoarray::AutoArray<uint8_t> ctable(256,false);

	// 4 marks symbols which are not regular bases
	std::fill(ftable.begin(),ftable.end(),4);
	std::fill(ctable.begin(),ctable.end(),4);
	ftable['a'] = ftable['A'] = 0;
	ftable['c'] = ftable['C'] = 1;
	ftable['g'] = ftable['G'] = 2;
	ftable['t'] = ftable['T'] = 3;
	// accumulated input size (sequence length plus one per sequence)
	uint64_t insize = 0;

	ctable[0] = 3; // A->T
	ctable[1] = 2; // C->G
	ctable[2] = 1; // G->C
	ctable[3] = 0; // T->A

	libmaus2::aio::OutputStreamInstance::unique_ptr_type metaOut(new libmaus2::aio::OutputStreamInstance(metaoutputfilename));
	// place holder for the number of sequences, overwritten after all input has been processed
	libmaus2::util::NumberSerialisation::serialiseNumber(*metaOut,0);
	// number of sequences processed
	uint64_t nseq = 0;
	// sequence lengths, used for checking the meta data file below
	std::vector<uint64_t> lvec;

	for ( uint64_t i = 0; i < inputfilenames.size(); ++i )
	{
		std::string const fn = inputfilenames[i];
		libmaus2::aio::InputStreamInstance CIS(fn);
		libmaus2::lz::BufferedGzipStream::unique_ptr_type BGS;
		// stream the FastA data is read from: decompressing stream if gz is set, file otherwise
		std::istream * istr = 0;
		if ( gz )
		{
			libmaus2::lz::BufferedGzipStream::unique_ptr_type tBGS(
				new libmaus2::lz::BufferedGzipStream(CIS));
			BGS = UNIQUE_PTR_MOVE(tBGS);
			istr = BGS.get();
		}
		else
		{
			istr = &CIS;
		}
		libmaus2::fastx::StreamFastAReaderWrapper fain(*istr);
		libmaus2::fastx::StreamFastAReaderWrapper::pattern_type pattern;

		while ( fain.getNextPatternUnlocked(pattern) )
		{
			if ( verbose )
				std::cerr << (i+1) << " " << stripAfterDot(basename(fn)) << " " << pattern.sid << "...";

			// write sequence length and a place holder for the number of replaced intervals
			libmaus2::util::NumberSerialisation::serialiseNumber(*metaOut,pattern.spattern.size());
			lvec.push_back(pattern.spattern.size());
			libmaus2::util::NumberSerialisation::serialiseNumber(*metaOut,0);

			// map symbols
			for ( uint64_t j = 0; j < pattern.spattern.size(); ++j )
				pattern.spattern[j] = ftable[static_cast<uint8_t>(pattern.spattern[j])];

			// replace blocks of N symbols by random bases
			uint64_t l = 0;
			// number of replaced blocks
			uint64_t nr = 0;
			while ( l < pattern.spattern.size() )
			{
				// skip regular bases
				while ( l < pattern.spattern.size() && pattern.spattern[l] < 4 )
					++l;
				assert ( l == pattern.spattern.size() || pattern.spattern[l] == 4 );

				// go to end of non regular bases block
				uint64_t h = l;
				while ( h < pattern.spattern.size() && pattern.spattern[h] == 4 )
					++h;

				// if non regular block is not empty
				if ( h-l )
				{
					// replace by random bases
					for ( uint64_t j = l; j < h; ++j )
						pattern.spattern[j] = (libmaus2::random::Random::rand8() & 3);

					// write bounds
					libmaus2::util::NumberSerialisation::serialiseNumber(*metaOut,l);
					libmaus2::util::NumberSerialisation::serialiseNumber(*metaOut,h);
					// add to interval counter
					nr += 1;
				}

				l = h;
			}

			// make sure there are no more irregular bases
			for ( uint64_t j = 0; j < pattern.spattern.size(); ++j )
				assert ( pattern.spattern[j] < 4 );

			// go back to start of meta data
			// (2*nr interval bounds plus the place holder for the number of intervals)
			// NOTE(review): the negated int64_t value is multiplied by an unsigned sizeof result,
			// so the offset relies on the conversion back to a signed stream offset - confirm this is intended
			metaOut->seekp( - static_cast<int64_t>(2*nr+1)*sizeof(uint64_t), std::ios::cur );
			// write number of intervals replaced
			libmaus2::util::NumberSerialisation::serialiseNumber(*metaOut,nr);
			// skip interval bounds already written
			metaOut->seekp(   static_cast<int64_t>(2*nr  )*sizeof(uint64_t), std::ios::cur );

			// write bases
			compactout.write(pattern.spattern.c_str(),pattern.spattern.size());

			// write reverse complement if requested
			if ( rc )
			{
				// reverse complement
				std::reverse(pattern.spattern.begin(),pattern.spattern.end());
				for ( uint64_t j = 0; j < pattern.spattern.size(); ++j )
					pattern.spattern[j] = ctable[static_cast<uint8_t>(pattern.spattern[j])];

				// write
				compactout.write(pattern.spattern.c_str(),pattern.spattern.size());
			}

			insize += pattern.spattern.size()+1;
			nseq += 1;

			if ( verbose )
				std::cerr << "done, input size " << formatBytes(pattern.spattern.size()+1) << " acc " << formatBytes(insize) << std::endl;
		}
	}

	// replace the place holder at the start of the meta data by the number of sequences
	metaOut->seekp(0);
	libmaus2::util::NumberSerialisation::serialiseNumber(*metaOut,nseq);
	metaOut->flush();
	metaOut.reset();

	// read the meta data back and check it against the values collected above
	libmaus2::aio::InputStreamInstance::unique_ptr_type metaISI(new libmaus2::aio::InputStreamInstance(metaoutputfilename));
	// number of sequences
	uint64_t const rnseq = libmaus2::util::NumberSerialisation::deserialiseNumber(*metaISI);
	assert ( nseq == rnseq );
	for ( uint64_t i = 0; i < nseq; ++i )
	{
		// length of sequence
		uint64_t const l = libmaus2::util::NumberSerialisation::deserialiseNumber(*metaISI);
		assert ( l == lvec[i] );
		// number of replaced intervals
		uint64_t const nr = libmaus2::util::NumberSerialisation::deserialiseNumber(*metaISI);
		// skip replaced intervals
		metaISI->ignore(2*nr*sizeof(uint64_t));
	}
	// no data is expected behind the record of the last sequence
	assert ( metaISI->peek() == std::istream::traits_type::eof() );

	std::cerr << "Done, total input size " << insize << std::endl;

	compactout.flush();

	return EXIT_SUCCESS;
}
Exemple #17
0
			/**
			 * construct an object from the contents of the file named fs
			 *
			 * @param fs name of the file the object is read from
			 * @return object constructed from the file, encapsulated in a unique pointer
			 **/
			static unique_ptr_type load(std::string const & fs)
			{
				libmaus2::aio::CheckedInputStream istr(fs);
				unique_ptr_type uptr(new this_type(istr));
				return UNIQUE_PTR_MOVE(uptr);
			}
Exemple #18
0
int main(int argc, char * argv[])
{
	try
	{
		::libmaus::util::ArgInfo arginfo(argc,argv);
		::libmaus::util::TempFileRemovalContainer::setup();
		::std::vector<std::string> const & inputfilenames = arginfo.restargs;
		char const * fasuffixes[] = { ".fa", ".fasta", 0 };
		
		std::string deftmpname = libmaus::util::OutputFileNameTools::endClipLcp(inputfilenames,&fasuffixes[0]) + ".fa.tmp";
		while ( ::libmaus::util::GetFileSize::fileExists(deftmpname) )
			deftmpname += "_";
		std::string defoutname = libmaus::util::OutputFileNameTools::endClipLcp(inputfilenames,&fasuffixes[0]) + ".fa.recoded";
		while ( ::libmaus::util::GetFileSize::fileExists(defoutname) )
			defoutname += "_";

		std::string const tempfilename = arginfo.getValue<std::string>("tempfilename",deftmpname);
		std::string const outfilename = arginfo.getValue<std::string>("outputfilename",defoutname);
		std::string const indexfilename = tempfilename + ".index";
		unsigned int const addterm = arginfo.getValue<unsigned int>("addterm",0);
		unsigned int const termadd = addterm ? 1 : 0;

		::libmaus::util::TempFileRemovalContainer::addTempFile(tempfilename);
		::libmaus::util::TempFileRemovalContainer::addTempFile(indexfilename);
		
		std::cerr << "temp file name " << tempfilename << std::endl;
		std::cerr << "output file name " << outfilename << std::endl;
		
		/* uint64_t const numseq = */ ::libmaus::fastx::FastAReader::rewriteFiles(inputfilenames,tempfilename,indexfilename);
		uint64_t curpos = 0;
		::libmaus::aio::CheckedOutputStream COS(outfilename);
		
		// 0,A,C,G,T,N
		// map forward
		::libmaus::autoarray::AutoArray<char> cmap(256,false);
		std::fill(cmap.begin(),cmap.end(),5+termadd);
		cmap['\n'] = 0 + termadd;
		cmap['a'] = cmap['A'] = 1 + termadd;
		cmap['c'] = cmap['C'] = 2 + termadd;
		cmap['g'] = cmap['G'] = 3 + termadd;
		cmap['t'] = cmap['T'] = 4 + termadd;
		cmap['n'] = cmap['N'] = 5 + termadd;

		// map to reverse complement
		::libmaus::autoarray::AutoArray<char> rmap(256,false);
		std::fill(rmap.begin(),rmap.end(),5+termadd);
		rmap['\n'] = 0 + termadd;
		rmap['a'] = rmap['A'] = 4 + termadd;
		rmap['c'] = rmap['C'] = 3 + termadd;
		rmap['g'] = rmap['G'] = 2 + termadd;
		rmap['t'] = rmap['T'] = 1 + termadd;
		rmap['n'] = rmap['N'] = 5 + termadd;

		// reverse complement for mapped data
		::libmaus::autoarray::AutoArray<char> xmap(256,false);
		std::fill(xmap.begin(),xmap.end(),5+termadd);
		xmap[0] = 0 + termadd;
		xmap[1] = 4 + termadd;
		xmap[2] = 3 + termadd;
		xmap[3] = 2 + termadd;
		xmap[4] = 1 + termadd;
		xmap[5] = 5 + termadd;

		::libmaus::autoarray::AutoArray<char> imap(256,false);
		for ( uint64_t i = 0; i < imap.size(); ++i )
			imap[i] = static_cast<char>(i);
		
		::libmaus::fastx::FastAReader::RewriteInfoDecoder::unique_ptr_type infodec(new ::libmaus::fastx::FastAReader::RewriteInfoDecoder(indexfilename));
		::libmaus::fastx::FastAReader::RewriteInfo info;
		uint64_t maxseqlen = 0;
		while ( infodec->get(info) )
			maxseqlen = std::max(maxseqlen,info.seqlen);
			
		std::cerr << "[V] max seq len " << maxseqlen << std::endl;

		::libmaus::fastx::FastAReader::RewriteInfoDecoder::unique_ptr_type tinfodec(new ::libmaus::fastx::FastAReader::RewriteInfoDecoder(indexfilename));
		infodec = UNIQUE_PTR_MOVE(tinfodec);
		
		if ( maxseqlen <= 256*1024 )
		{
			::libmaus::aio::CheckedInputStream CIS(tempfilename);
			::libmaus::autoarray::AutoArray<uint8_t> B(maxseqlen+1,false);

			while ( infodec->get(info) )
			{
				// skip id
				CIS.ignore(info.idlen+2);
				// read sequence plus following terminator
				CIS.read(reinterpret_cast<char *>(B.begin()), info.seqlen+1);
				// map
				for ( uint64_t i = 0; i < info.seqlen+1; ++i )
					B[i] = cmap[B[i]];
				// write
				COS.write(reinterpret_cast<char const *>(B.begin()),info.seqlen+1);
				// remap
				for ( uint64_t i = 0; i < info.seqlen+1; ++i )
					B[i] = xmap[B[i]];
				// reverse
				std::reverse(B.begin(),B.begin()+info.seqlen);
				// write
				COS.write(reinterpret_cast<char const *>(B.begin()),info.seqlen+1);
			}
		}
		else
		{
			while ( infodec->get(info) )
			{
				// std::cerr << info.valid << "\t" << info.idlen << "\t" << info.seqlen << "\t" << info.getIdPrefix() << std::endl;
				uint64_t const seqbeg = curpos + (info.idlen+2);
				uint64_t const seqend = seqbeg + info.seqlen;
				
				::libmaus::aio::CheckedInputStream CIS(tempfilename); CIS.seekg(seqbeg);
				::libmaus::util::GetFileSize::copyMap(CIS,COS,cmap.begin(),seqend-seqbeg+1);
				
				::libmaus::aio::CircularReverseWrapper CRW(tempfilename,seqend);
				::libmaus::util::GetFileSize::copyMap(CRW,COS,rmap.begin(),seqend-seqbeg+1);
				
				curpos += (info.idlen+2) + (info.seqlen+1);
			}		
		}
		
		if ( addterm )
			COS.put(0);

		return EXIT_SUCCESS;
	}
	catch(std::exception const & ex)
	{
		std::cerr << ex.what() << std::endl;
		return EXIT_FAILURE;
	}
}
Exemple #19
0
/**
 * Convert a byte file into a compact array file.
 *
 * The first non-option argument is the input file name, the second the output
 * file name. Recognised key=value options:
 *  - verbose: if non zero, print progress information on std::cerr (default 1)
 *  - addterm: if non zero, add one to every symbol and append a single 0
 *             symbol at the end of the output (default 0)
 *
 * @param argc number of command line arguments
 * @param argv command line arguments
 * @return EXIT_SUCCESS on success, EXIT_FAILURE if a std::exception was caught
 **/
int main(int argc, char * argv[])
{
	try
	{
		::libmaus::util::ArgInfo const arginfo(argc,argv);
		std::string const input = arginfo.getRestArg<std::string>(0);
		std::string const output = arginfo.getRestArg<std::string>(1);
		unsigned int const verbose = arginfo.getValue<unsigned int>("verbose",1);
		unsigned int const addterm = arginfo.getValue<unsigned int>("addterm",0) ? 1 : 0;

		// largest symbol with a non zero count in the histogram
		::libmaus::autoarray::AutoArray<uint64_t> const chist = computeCharHist(input);
		uint64_t maxsym = 0;
		for ( uint64_t i = 0; i < chist.size(); ++i )
			if ( chist[i] )
				maxsym = i;
		if ( addterm )
			maxsym += 1;
		// bits per symbol passed to the writer (presumably the bit width of maxsym - confirm Clz::clz semantics)
		unsigned int const b = maxsym ? (64-::libmaus::bitio::Clz::clz(maxsym)) : 0;

		// total number of symbols in the input
		uint64_t const n = std::accumulate(chist.begin(),chist.end(),0ull);
		if ( verbose )
			std::cerr << "[V] n=" << n << " maxsym=" << maxsym << " b=" << b << std::endl;

		uint64_t const blocksize = 8*1024;
		uint64_t const numblocks = (n+blocksize-1)/blocksize;
		::libmaus::autoarray::AutoArray<uint8_t> B(blocksize);
		::libmaus::aio::CheckedInputStream CIS(input);
		::libmaus::bitio::CompactArrayWriter CAW(output,n+addterm,b);
		int64_t lastperc = -1;

		if ( verbose )
			std::cerr << "[V] ";

		// the loop is not entered for n == 0, so the division by n below is safe
		for ( uint64_t blockid = 0; blockid < numblocks; ++blockid )
		{
			uint64_t const low = std::min(blockid*blocksize,n);
			uint64_t const high = std::min(low+blocksize,n);
			uint64_t const range = high-low;

			CIS.read ( reinterpret_cast<char *>(B.begin()), range );
			assert ( CIS.gcount() == static_cast<int64_t>(range) );

			// NOTE(review): B holds uint8_t, so a symbol value of 255 would wrap to 0
			// here and collide with the terminator - confirm such input cannot occur
			if ( addterm )
				for ( uint64_t i = 0; i < range; ++i )
					B[i] += 1;

			CAW.write(B.begin(),range);

			// print each percentage value once
			int64_t const newperc = (high * 100) / n;
			if ( verbose && newperc != lastperc )
			{
				lastperc = newperc;
				std::cerr << "(" << newperc << ")";
			}
		}
		if ( addterm )
			CAW.put(0);
		if ( verbose )
			std::cerr << std::endl;

		CAW.flush();

		#if 0
		::libmaus::bitio::CompactDecoderWrapper CDW(output);
		for ( uint64_t i = 0; i < n+addterm; ++i )
			std::cerr << CDW.get();
		std::cerr << std::endl;
		#endif

		return EXIT_SUCCESS;
	}
	catch(std::exception const & ex)
	{
		std::cerr << ex.what() << std::endl;
		// report the failure to the caller instead of falling off the end of main (exit status 0)
		return EXIT_FAILURE;
	}
}
Exemple #20
0
			/**
			 * Construct an object from the contents of a file.
			 *
			 * Opens filename through an InputStreamInstance and passes that
			 * stream to the this_type constructor.
			 *
			 * @param filename name of the file to read from
			 * @return newly constructed object wrapped in a unique pointer
			 **/
			static unique_ptr_type load(std::string const & filename)
			{
				// the stream only needs to live for the duration of the constructor call
				libmaus2::aio::InputStreamInstance istr(filename);
				unique_ptr_type obj(new this_type(istr));
				return UNIQUE_PTR_MOVE(obj);
			}
Exemple #21
0
                        /**
                         * Construct an object from the contents of a file.
                         *
                         * Opens fn through a CheckedInputStream and passes lf together
                         * with that stream to the this_type constructor.
                         *
                         * @param lf pointer handed through unchanged to the constructor
                         * @param fn name of the file to read from
                         * @return newly constructed object wrapped in a unique pointer
                         **/
                        static unique_ptr_type load(lf_type const * lf, std::string const & fn)
                        {
                        	// the stream only needs to live for the duration of the constructor call
                        	libmaus::aio::CheckedInputStream istr(fn);
                        	unique_ptr_type obj(new this_type(lf,istr));
                        	return UNIQUE_PTR_MOVE(obj);
                        }