Ejemplo n.º 1
0
/*************************************************
Function:
    prlRead2HashTable
Description:
    1. Imports the reads from the lib file one by one.
    2. Chops the reads into kmers and store them in KmerSets.
    3. Removes the kmers with low coverage.
    4. Marks the linear kmers.
    5. Counts the kmer frequences.
Input:
    1. libfile :            the reads config file
    2. outfile :        the output file prefix
Output:
    None.
Return:
    1 if exits normally.
*************************************************/
boolean prlRead2HashTable ( char * libfile, char * outfile )
{
	char * cach1;
	char * cach2;
	unsigned char asm_ctg = 1;
	long long i;
	char * next_name, name[256];
	FILE * fo;
	time_t start_t, stop_t;
	int maxReadNum;
	int libNo;
	pthread_t threads[thrd_num];
	unsigned char thrdSignal[thrd_num + 1];
	PARAMETER paras[thrd_num];
	boolean flag, pairs = 0;
	WORDFILTER = createFilter ( overlaplen );
	maxReadLen = 0;
	maxNameLen = 256;
	scan_libInfo ( libfile );
	alloc_pe_mem ( num_libs );

	if ( !maxReadLen )
	{
		maxReadLen = 100;
	}

	if ( gLineLen < maxReadLen )
	{
		gStr = ( char * ) ckalloc ( ( maxReadLen + 1 ) * sizeof ( char ) );
	}

	//init
	maxReadLen4all = maxReadLen;
	fprintf ( stderr, "In %s, %d lib(s), maximum read length %d, maximum name length %d.\n\n", libfile, num_libs, maxReadLen, maxNameLen );
	next_name = ( char * ) ckalloc ( ( maxNameLen + 1 ) * sizeof ( char ) );
	kmerBuffer = ( Kmer * ) ckalloc ( buffer_size * sizeof ( Kmer ) );
	hashBanBuffer = ( ubyte8 * ) ckalloc ( buffer_size * sizeof ( ubyte8 ) );
	prevcBuffer = ( char * ) ckalloc ( buffer_size * sizeof ( char ) );
	nextcBuffer = ( char * ) ckalloc ( buffer_size * sizeof ( char ) );
	maxReadNum = buffer_size / ( maxReadLen - overlaplen + 1 );
	//printf("buffer size %d, max read len %d, max read num %d\n",buffer_size,maxReadLen,maxReadNum);
	int maxAIOSize = 32768;
	aioBuffer1 = ( char * ) ckalloc ( ( maxAIOSize ) * sizeof ( char ) );
	aioBuffer2 = ( char * ) ckalloc ( ( maxAIOSize ) * sizeof ( char ) );
	readBuffer1 = ( char * ) ckalloc ( ( maxAIOSize + ( maxReadLen * 4 + 1024 ) ) * sizeof ( char ) ); //(char *)ckalloc(maxAIOSize*sizeof(char)); //1024
	readBuffer2 = ( char * ) ckalloc ( ( maxAIOSize + ( maxReadLen * 4 + 1024 ) ) * sizeof ( char ) ); //1024
	cach1 = ( char * ) ckalloc ( ( maxReadLen * 4 + 1024 ) * sizeof ( char ) ); //1024
	cach2 = ( char * ) ckalloc ( ( maxReadLen * 4 + 1024 ) * sizeof ( char ) ); //1024
	memset ( cach1, '\0', ( maxReadLen * 4 + 1024 ) ); //1024
	memset ( cach2, '\0', ( maxReadLen * 4 + 1024 ) ); //1024
	seqBuffer = ( char ** ) ckalloc ( maxReadNum * sizeof ( char * ) );
	lenBuffer = ( int * ) ckalloc ( maxReadNum * sizeof ( int ) );
	indexArray = ( int * ) ckalloc ( maxReadNum * sizeof ( int ) );

	for ( i = 0; i < maxReadNum; i++ )
	{
		seqBuffer[i] = ( char * ) ckalloc ( maxReadLen * sizeof ( char ) );
	}

	rcSeq = ( char ** ) ckalloc ( ( thrd_num + 1 ) * sizeof ( char * ) );

	if ( 1 )
	{
		kmerCounter = ( long long * ) ckalloc ( ( thrd_num + 1 ) * sizeof ( long long ) );
		KmerSets = ( KmerSet ** ) ckalloc ( thrd_num * sizeof ( KmerSet * ) );
		ubyte8 init_size = 1024;
		ubyte8 k = 0;

		if ( initKmerSetSize )
		{
#ifdef MER127
			init_size = ( ubyte8 ) ( ( double ) initKmerSetSize * 1024.0f * 1024.0f * 1024.0f / ( double ) thrd_num / 40 );
#else
			init_size = ( ubyte8 ) ( ( double ) initKmerSetSize * 1024.0f * 1024.0f * 1024.0f / ( double ) thrd_num / 24 ); //is it true?
#endif

			do
			{
				++k;
			}
			while ( k * 0xFFFFFFLLU < init_size );
		}

		for ( i = 0; i < thrd_num; i++ )
		{
			//KmerSets[i] = init_kmerset(1024,0.77f);
			KmerSets[i] = init_kmerset ( ( ( initKmerSetSize ) ? ( k * 0xFFFFFFLLU ) : ( init_size ) ), 0.77f );
			thrdSignal[i + 1] = 0;
			paras[i].threadID = i;
			paras[i].mainSignal = &thrdSignal[0];
			paras[i].selfSignal = &thrdSignal[i + 1];
			kmerCounter[i + 1] = 0;
			rcSeq[i + 1] = ( char * ) ckalloc ( maxReadLen * sizeof ( char ) );
		}

		creatThrds ( threads, paras );
	}

	thrdSignal[0] = kmerCounter[0] = 0;
	time ( &start_t );
	kmer_c = n_solexa = read_c = i = libNo = readNumBack = gradsCounter = 0;

	while ( openNextFile ( &libNo, pairs, asm_ctg ) )
	{
		//read bam file
		if ( lib_array[libNo].curr_type == 4 )
		{
			int type = 0;   //deside the PE reads is good or bad

			while ( ( flag = read1seqInLibBam ( seqBuffer[read_c], next_name, & ( lenBuffer[read_c] ), &libNo, pairs, 1, &type ) ) != 0 )
			{
				if ( type == -1 ) //if the reads is bad, go back.
				{
					i--;

					if ( lenBuffer[read_c - 1] >= overlaplen + 1 )
					{
						kmer_c -= lenBuffer[read_c - 1] - overlaplen + 1;
						read_c--;
					}

					n_solexa -= 2;
					continue;
				}

				if ( ( ++i ) % 100000000 == 0 )
					{ fprintf ( stderr, "--- %lldth reads.\n", i ); }

				if ( lenBuffer[read_c] < 0 )
					{ fprintf ( stderr, "Read len %d.\n", lenBuffer[read_c] ); }

				if ( lenBuffer[read_c] < overlaplen + 1 )
					{ continue; }

				/*
				   if(lenBuffer[read_c]>70)
				   lenBuffer[read_c] = 50;
				   else if(lenBuffer[read_c]>40)
				   lenBuffer[read_c] = 40;
				 */
				indexArray[read_c] = kmer_c;
				kmer_c += lenBuffer[read_c] - overlaplen + 1;
				read_c++;

				if ( read_c == maxReadNum )
				{
					kmerCounter[0] += kmer_c;
					sendWorkSignal ( 2, thrdSignal ); //chopKmer4read
					sendWorkSignal ( 1, thrdSignal ); //singleKmer
					kmer_c = read_c = 0;
				}
			}
		}
		//read PE fasta or fastq
		else if ( lib_array[libNo].curr_type == 1 || lib_array[libNo].curr_type == 2 )
		{
			initAIO ( &aio1, aioBuffer1, fileno ( lib_array[libNo].fp1 ), maxAIOSize );
			initAIO ( &aio2, aioBuffer2, fileno ( lib_array[libNo].fp2 ), maxAIOSize );
			int offset1, offset2, flag1, flag2, rt1, rt2;
			offset1 = offset2 = 0;
			rt1 = aio_read ( &aio1 );
			rt2 = aio_read ( &aio2 );
			flag1 = AIORead ( &aio1, &offset1, readBuffer1, cach1, &rt1, lib_array[libNo].curr_type );
			flag2 = AIORead ( &aio2, &offset2, readBuffer2, cach2, &rt2, lib_array[libNo].curr_type );

			if ( flag1 && flag2 )
			{
				int start1, start2, turn;
				start1 = start2 = 0;
				turn = 1;

				while ( start1 < offset1 || start2 < offset2 )
				{
					if ( turn == 1 )
					{
						turn = 2;
						readseqInLib ( seqBuffer[read_c], next_name, & ( lenBuffer[read_c] ), readBuffer1, &start1, offset1, libNo );

						if ( ( ++i ) % 100000000 == 0 )
							{ fprintf ( stderr, "--- %lldth reads.\n", i ); }

						if ( lenBuffer[read_c] < 0 )
							{ fprintf ( stderr, "Read len %d.\n", lenBuffer[read_c] ); }

						if ( lenBuffer[read_c] < overlaplen + 1 )
						{
							if ( start1 >= offset1 )
							{
								start1 = 0;
								offset1 = 0;
								flag1 = AIORead ( &aio1, &offset1, readBuffer1, cach1, &rt1, lib_array[libNo].curr_type );
							}

							continue;
						}

						indexArray[read_c] = kmer_c;
						kmer_c += lenBuffer[read_c] - overlaplen + 1;
						read_c++;

						if ( start1 >= offset1 )
						{
							start1 = 0;
							offset1 = 0;
							flag1 = AIORead ( &aio1, &offset1, readBuffer1, cach1, &rt1, lib_array[libNo].curr_type );
						}

						if ( read_c == maxReadNum )
						{
							kmerCounter[0] += kmer_c;
							sendWorkSignal ( 2, thrdSignal );   //chopKmer4read
							sendWorkSignal ( 1, thrdSignal );   //singleKmer
							kmer_c = read_c = 0;
						}

						continue;
					}

					if ( turn == 2 )
					{
						turn = 1;
						readseqInLib ( seqBuffer[read_c], next_name, & ( lenBuffer[read_c] ), readBuffer2, &start2, offset2, libNo );

						if ( ( ++i ) % 100000000 == 0 )
							{ fprintf ( stderr, "--- %lldth reads.\n", i ); }

						if ( lenBuffer[read_c] < 0 )
							{ fprintf ( stderr, "Read len %d.\n", lenBuffer[read_c] ); }

						if ( lenBuffer[read_c] < overlaplen + 1 )
						{
							if ( ( flag2 == 2 ) && ( start2 >= offset2 ) )
								{ break; }

							if ( start2 >= offset2 )
							{
								start2 = 0;
								offset2 = 0;
								flag2 = AIORead ( &aio2, &offset2, readBuffer2, cach2, &rt2, lib_array[libNo].curr_type );
							}

							continue;
						}

						indexArray[read_c] = kmer_c;
						kmer_c += lenBuffer[read_c] - overlaplen + 1;
						read_c++;

						if ( ( flag2 == 2 ) && ( start2 >= offset2 ) )
							{ break; }

						if ( start2 >= offset2 )
						{
							start2 = 0;
							offset2 = 0;
							flag2 = AIORead ( &aio2, &offset2, readBuffer2, cach2, &rt2, lib_array[libNo].curr_type );
						}

						if ( read_c == maxReadNum )
						{
							kmerCounter[0] += kmer_c;
							sendWorkSignal ( 2, thrdSignal );   //chopKmer4read
							sendWorkSignal ( 1, thrdSignal );   //singleKmer
							kmer_c = read_c = 0;
						}

						continue;
					}
				}
			}
			else
			{
				fprintf(stderr, "Error: aio_read error.\n");
			}
		}
		//read single fasta, single fastq and PE fasta in one file
		else
		{
			initAIO ( &aio1, aioBuffer1, fileno ( lib_array[libNo].fp1 ), maxAIOSize );
			int offset, flag1, rt;
			offset = 0;
			rt = aio_read ( &aio1 );

			while ( ( flag1 = AIORead ( &aio1, &offset, readBuffer1, cach1, &rt, lib_array[libNo].curr_type ) ) )
			{
				int start = 0;

				while ( start < offset )
				{
					readseqInLib ( seqBuffer[read_c], next_name, & ( lenBuffer[read_c] ), readBuffer1, &start, offset, libNo );

					if ( ( ++i ) % 100000000 == 0 )
						{ fprintf ( stderr, "--- %lldth reads.\n", i ); }

					if ( lenBuffer[read_c] < 0 )
						{ fprintf ( stderr, "Read len %d.\n", lenBuffer[read_c] ); }

					if ( lenBuffer[read_c] < overlaplen + 1 )
						{ continue; }

					indexArray[read_c] = kmer_c;
					kmer_c += lenBuffer[read_c] - overlaplen + 1;
					read_c++;
				}

				if ( read_c > maxReadNum - 1024 )
				{
					kmerCounter[0] += kmer_c;
					sendWorkSignal ( 2, thrdSignal );   //chopKmer4read
					sendWorkSignal ( 1, thrdSignal );   //singleKmer
					kmer_c = read_c = 0;
				}

				if ( flag1 == 2 )
					{ break; }
			}
		}
	}

	if ( read_c )
	{
		kmerCounter[0] += kmer_c;
		sendWorkSignal ( 2, thrdSignal );   //chopKmer4read
		sendWorkSignal ( 1, thrdSignal );   //singleKmer
	}

	time ( &stop_t );
	fprintf ( stderr, "Time spent on hashing reads: %ds, %lld read(s) processed.\n", ( int ) ( stop_t - start_t ), i );

	//record insert size info
	if ( pairs )
	{
		if ( gradsCounter )
			{ fprintf ( stderr, "%d pe insert size, the largest boundary is %lld.\n\n", gradsCounter, pes[gradsCounter - 1].PE_bound ); }
		else
		{
			fprintf ( stderr, "No paired reads found.\n" );
		}

		sprintf ( name, "%s.peGrads", outfile );
		fo = ckopen ( name, "w" );
		fprintf ( fo, "grads&num: %d\t%lld\n", gradsCounter, n_solexa );

		for ( i = 0; i < gradsCounter; i++ )
		{
			fprintf ( fo, "%d\t%lld\t%d\n", pes[i].insertS, pes[i].PE_bound, pes[i].rank );
		}

		fclose ( fo );
	}

	free_pe_mem ();
	free_libs ();

	if ( 1 )
	{
		unsigned long long alloCounter = 0;
		unsigned long long allKmerCounter = 0;

		for ( i = 0; i < thrd_num; i++ )
		{
			alloCounter += count_kmerset ( ( KmerSets[i] ) );
			allKmerCounter += kmerCounter[i + 1];
			free ( ( void * ) rcSeq[i + 1] );
		}

		fprintf ( stderr, "%lli node(s) allocated, %lli kmer(s) in reads, %lli kmer(s) processed.\n", alloCounter, kmerCounter[0], allKmerCounter );
	}

	free ( ( void * ) rcSeq );
	free ( ( void * ) kmerCounter );

	for ( i = 0; i < maxReadNum; i++ )
	{
		free ( ( void * ) seqBuffer[i] );
	}

	free ( ( void * ) seqBuffer );
	free ( ( void * ) lenBuffer );
	free ( ( void * ) indexArray );
	free ( ( void * ) kmerBuffer );
	free ( ( void * ) hashBanBuffer );
	free ( ( void * ) nextcBuffer );
	free ( ( void * ) prevcBuffer );
	free ( ( void * ) next_name );
	free ( ( void * ) aioBuffer1 );
	free ( ( void * ) aioBuffer2 );
	free ( ( void * ) readBuffer1 );
	free ( ( void * ) readBuffer2 );
	free ( ( void * ) cach1 );
	free ( ( void * ) cach2 );
	fprintf ( stderr, "done hashing nodes\n" );

	if ( deLowKmer )
	{
		time ( &start_t );
		deLowCov ( thrdSignal );
		time ( &stop_t );
		fprintf ( stderr, "Time spent on delowcvgNode: %ds.\n", ( int ) ( stop_t - start_t ) );
	}

	time ( &start_t );
	Mark1in1outNode ( thrdSignal );
	freqStat ( outfile );
	time ( &stop_t );
	fprintf ( stderr, "Time spent on marking linear nodes: %ds.\n", ( int ) ( stop_t - start_t ) );
	sendWorkSignal ( 3, thrdSignal );   //exit
	thread_wait ( threads );
	return 1;
}
Ejemplo n.º 2
0
boolean prlRead2HashTable ( char * libfile, char * outfile )
{
	long long i;
	char * next_name, name[256];
	FILE * fo;
	time_t start_t, stop_t;
	int maxReadNum;
	int libNo;
	pthread_t threads[thrd_num];
	unsigned char thrdSignal[thrd_num + 1];
	PARAMETER paras[thrd_num];
	boolean flag, pairs = 0;
	WORDFILTER = createFilter ( overlaplen );
	maxReadLen = 0;
	maxNameLen = 256;
	scan_libInfo ( libfile );
	alloc_pe_mem ( num_libs );

	if ( !maxReadLen )
		{ maxReadLen = 100; }

	maxReadLen4all = maxReadLen;
	printf ( "In %s, %d libs, max seq len %d, max name len %d\n\n",
	         libfile, num_libs, maxReadLen, maxNameLen );
	next_name = ( char * ) ckalloc ( ( maxNameLen + 1 ) * sizeof ( char ) );
	kmerBuffer = ( Kmer * ) ckalloc ( buffer_size * sizeof ( Kmer ) );
	hashBanBuffer = ( ubyte8 * ) ckalloc ( buffer_size * sizeof ( ubyte8 ) );
	prevcBuffer = ( char * ) ckalloc ( buffer_size * sizeof ( char ) );
	nextcBuffer = ( char * ) ckalloc ( buffer_size * sizeof ( char ) );
	maxReadNum = buffer_size / ( maxReadLen - overlaplen + 1 );
	//printf("buffer size %d, max read len %d, max read num %d\n",buffer_size,maxReadLen,maxReadNum);
	seqBuffer = ( char ** ) ckalloc ( maxReadNum * sizeof ( char * ) );
	lenBuffer = ( int * ) ckalloc ( maxReadNum * sizeof ( int ) );
	indexArray = ( int * ) ckalloc ( maxReadNum * sizeof ( int ) );

	for ( i = 0; i < maxReadNum; i++ )
		{ seqBuffer[i] = ( char * ) ckalloc ( maxReadLen * sizeof ( char ) ); }

	rcSeq = ( char ** ) ckalloc ( ( thrd_num + 1 ) * sizeof ( char * ) );

	if ( 1 )
	{
		kmerCounter = ( long long * ) ckalloc ( ( thrd_num + 1 ) * sizeof ( long long ) );
		KmerSets = ( KmerSet ** ) ckalloc ( thrd_num * sizeof ( KmerSet * ) );
		ubyte8  init_size = 1024;
		ubyte8 k = 0;

		if ( initKmerSetSize )
		{
			init_size = ( ubyte8 ) ( ( double ) initKmerSetSize * 1024.0f * 1024.0f * 1024.0f / ( double ) thrd_num / 32 );

			do
			{
				++k;
			}
			while ( k * 0xFFFFFFLLU < init_size );
		}

		for ( i = 0; i < thrd_num; i++ )
		{
			//KmerSets[i] = init_kmerset(1024,0.77f);
			KmerSets[i] = init_kmerset ( k * 0xFFFFFFLLU, 0.77f );
			thrdSignal[i + 1] = 0;
			paras[i].threadID = i;
			paras[i].mainSignal = &thrdSignal[0];
			paras[i].selfSignal = &thrdSignal[i + 1];
			kmerCounter[i + 1] = 0;
			rcSeq[i + 1] = ( char * ) ckalloc ( maxReadLen * sizeof ( char ) );
		}

		creatThrds ( threads, paras );
	}

	thrdSignal[0] = kmerCounter[0] = 0;
	time ( &start_t );
	kmer_c = n_solexa = read_c = i = libNo = readNumBack = gradsCounter = 0;

	while ( ( flag = read1seqInLib ( seqBuffer[read_c], next_name, & ( lenBuffer[read_c] ), &libNo, pairs, 1 ) ) != 0 )
	{
		if ( ( ++i ) % 100000000 == 0 )
			{ printf ( "--- %lldth reads\n", i ); }

		if ( lenBuffer[read_c] < 0 )
			{ printf ( "read len %d\n", lenBuffer[read_c] ); }

		if ( lenBuffer[read_c] < overlaplen + 1 )
			{ continue; }

		/*
		if(lenBuffer[read_c]>70)
		    lenBuffer[read_c] = 50;
		else if(lenBuffer[read_c]>40)
		    lenBuffer[read_c] = 40;
		*/
		indexArray[read_c] = kmer_c;
		kmer_c += lenBuffer[read_c] - overlaplen + 1;
		read_c++;

		if ( read_c == maxReadNum )
		{
			kmerCounter[0] += kmer_c;
			sendWorkSignal ( 2, thrdSignal );
			sendWorkSignal ( 1, thrdSignal );
			kmer_c = read_c = 0;
		}
	}

	if ( read_c )
	{
		kmerCounter[0] += kmer_c;
		sendWorkSignal ( 2, thrdSignal );
		sendWorkSignal ( 1, thrdSignal );
	}

	time ( &stop_t );
	printf ( "time spent on hash reads: %ds, %lld reads processed\n", ( int ) ( stop_t - start_t ), i );

	//record insert size info
	if ( pairs )
	{
		if ( gradsCounter )
			printf ( "%d pe insert size, the largest boundary is %lld\n\n",
			         gradsCounter, pes[gradsCounter - 1].PE_bound );
		else
			{ printf ( "no paired reads found\n" ); }

		sprintf ( name, "%s.peGrads", outfile );
		fo = ckopen ( name, "w" );
		fprintf ( fo, "grads&num: %d\t%lld\n", gradsCounter, n_solexa );

		for ( i = 0; i < gradsCounter; i++ )
			{ fprintf ( fo, "%d\t%lld\t%d\n", pes[i].insertS, pes[i].PE_bound, pes[i].rank ); }

		fclose ( fo );
	}

	free_pe_mem();
	free_libs();

	if ( 1 )
	{
		unsigned long long alloCounter = 0;
		unsigned long long allKmerCounter = 0;

		for ( i = 0; i < thrd_num; i++ )
		{
			alloCounter += count_kmerset ( ( KmerSets[i] ) );
			allKmerCounter += kmerCounter[i + 1];
			free ( ( void * ) rcSeq[i + 1] );
		}

		printf ( "%lli nodes allocated, %lli kmer in reads, %lli kmer processed\n"
		         , alloCounter, kmerCounter[0], allKmerCounter );
	}

	free ( ( void * ) rcSeq );
	free ( ( void * ) kmerCounter );

	for ( i = 0; i < maxReadNum; i++ )
		{ free ( ( void * ) seqBuffer[i] ); }

	free ( ( void * ) seqBuffer );
	free ( ( void * ) lenBuffer );
	free ( ( void * ) indexArray );
	free ( ( void * ) kmerBuffer );
	free ( ( void * ) hashBanBuffer );
	free ( ( void * ) nextcBuffer );
	free ( ( void * ) prevcBuffer );
	free ( ( void * ) next_name );

	//printf("done hashing nodes\n");
	if ( deLowKmer )
	{
		time ( &start_t );
		deLowCov ( thrdSignal );
		time ( &stop_t );
		printf ( "time spent on delowcvgNode %ds\n", ( int ) ( stop_t - start_t ) );
	}

	time ( &start_t );
	Mark1in1outNode ( thrdSignal );
	freqStat ( outfile );
	time ( &stop_t );
	printf ( "time spent on marking linear nodes %ds\n", ( int ) ( stop_t - start_t ) );
	fflush ( stdout );
	sendWorkSignal ( 3, thrdSignal );
	thread_wait ( threads );
	/*
	    Kmer word = 0x21c3ca82c734c8d0;
	    Kmer hash_ban = hash_kmer(word);
	    int setPicker = hash_ban%thrd_num;
	    kmer_t *node;
	    boolean found = search_kmerset(KmerSets[setPicker], word, &node);
	    if(!found)
	        printf("kmer %llx not found,\n",word);
	    else{
	        printf("kmer %llx, linear %d\n",word,node->linear);
	        for(i=0;i<4;i++){
	            if(get_kmer_right_cov(*node,i)>0)
	                printf("right %d, kmer %llx\n",i,nextKmer(node->seq,i));
	            if(get_kmer_left_cov(*node,i)>0)
	                printf("left %d, kmer %llx\n",i,prevKmer(node->seq,i));
	        }

	    }
	*/
	return 1;
}