Example #1
0
/*
 * Driver for the scaffolding stage.
 *
 * Sets up the environment from the command line (initenv), then calls the
 * loading, linking, scaffold/transcriptome building, gap closing and
 * statistics routines in a fixed order, all keyed on the global `graphfile`.
 * Wall-clock timings of the individual phases are printed to stdout.
 * The building phases are skipped when the global SCAFF flag is set.
 *
 * Always returns 0.
 */
int call_scaffold (int argc, char **argv)
{
	time_t run_begin, run_end;
	time_t phase_begin, phase_end;
	int elapsed_sec;

	run_begin = time (NULL);

	initenv (argc, argv);
	loadPEgrads (graphfile);

	phase_begin = time (NULL);
	loadUpdatedEdges (graphfile);
	phase_end = time (NULL);
	printf ("time spent on loading edges %ds\n", (int) (phase_end - phase_begin));

	if (!SCAFF)
	{
		/* Phase 1: paired-end information. */
		phase_begin = time (NULL);
		PE2Links (graphfile);
		phase_end = time (NULL);
		printf ("time spent on loading pair end info %ds\n\n", (int) (phase_end - phase_begin));

		/* Phase 2: scaffolds. */
		phase_begin = time (NULL);
		Links2Scaf (graphfile);
		phase_end = time (NULL);
		printf ("time spent on creating scaffolds %ds\n\n", (int) (phase_end - phase_begin));

		/* Phase 3: transcriptome. */
		phase_begin = time (NULL);
		transcriptome (graphfile);
		phase_end = time (NULL);
		printf ("time spent on creating transcriptome %ds\n", (int) (phase_end - phase_begin));
	}

	prlReadsCloseGap (graphfile);
	ScafStat (100, graphfile);

	if (read_trace)
	{
		getReadOnScaf (graphfile);

		/* RPKM statistics are only produced when the RPKM flag is set
		 * (the original note says this requires the '-R' parameter). */
		if (RPKM)
		{
			RPKMStat (graphfile);
		}
	}

	/* Tear-down, in the same order as the resources are released upstream. */
	free_pe_mem ();

	if (index_array != NULL)
	{
		free ((void *) index_array);
	}

	freeContig_array ();
	destroyPreArcMem ();
	destroyConnectMem ();
	deleteCntLookupTable ();

	run_end = time (NULL);
	elapsed_sec = (int) (run_end - run_begin);
	printf ("time elapsed: %dm\n", elapsed_sec / 60);

	return 0;
}
Example #2
0
/*************************************************
Function:
    prlRead2HashTable
Description:
    1. Imports the reads from the lib file one by one.
    2. Chops the reads into kmers and stores them in KmerSets.
    3. Removes the kmers with low coverage (only when deLowKmer is set).
    4. Marks the linear kmers.
    5. Counts the kmer frequencies.
Input:
    1. libfile :        the reads config file
    2. outfile :        the output file prefix
Output:
    None.
Return:
    1 (always; this function has no failure return path).
*************************************************/
boolean prlRead2HashTable ( char * libfile, char * outfile )
{
	char * cach1;
	char * cach2;
	unsigned char asm_ctg = 1;
	long long i;
	char * next_name, name[256];
	FILE * fo;
	time_t start_t, stop_t;
	int maxReadNum;
	int libNo;
	pthread_t threads[thrd_num];
	/* thrdSignal[0] is the main signal, thrdSignal[t + 1] belongs to worker t. */
	unsigned char thrdSignal[thrd_num + 1];
	PARAMETER paras[thrd_num];
	/* `pairs` is never changed below, so the insert-size report further down
	 * is currently not executed by this function. */
	boolean flag, pairs = 0;
	WORDFILTER = createFilter ( overlaplen );
	maxReadLen = 0;
	maxNameLen = 256;
	scan_libInfo ( libfile );
	alloc_pe_mem ( num_libs );

	if ( !maxReadLen )
	{
		maxReadLen = 100;
	}

	if ( gLineLen < maxReadLen )
	{
		gStr = ( char * ) ckalloc ( ( maxReadLen + 1 ) * sizeof ( char ) );
	}

	//init
	maxReadLen4all = maxReadLen;
	fprintf ( stderr, "In %s, %d lib(s), maximum read length %d, maximum name length %d.\n\n", libfile, num_libs, maxReadLen, maxNameLen );
	next_name = ( char * ) ckalloc ( ( maxNameLen + 1 ) * sizeof ( char ) );
	kmerBuffer = ( Kmer * ) ckalloc ( buffer_size * sizeof ( Kmer ) );
	hashBanBuffer = ( ubyte8 * ) ckalloc ( buffer_size * sizeof ( ubyte8 ) );
	prevcBuffer = ( char * ) ckalloc ( buffer_size * sizeof ( char ) );
	nextcBuffer = ( char * ) ckalloc ( buffer_size * sizeof ( char ) );
	/* One batch holds at most maxReadNum reads, so that the kmers of a full
	 * batch of maximum-length reads fit into the buffer_size kmer buffers.
	 * NOTE(review): assumes maxReadLen >= overlaplen; otherwise the divisor
	 * is zero or negative -- confirm that scan_libInfo guarantees this. */
	maxReadNum = buffer_size / ( maxReadLen - overlaplen + 1 );
	int maxAIOSize = 32768;
	aioBuffer1 = ( char * ) ckalloc ( ( maxAIOSize ) * sizeof ( char ) );
	aioBuffer2 = ( char * ) ckalloc ( ( maxAIOSize ) * sizeof ( char ) );
	readBuffer1 = ( char * ) ckalloc ( ( maxAIOSize + ( maxReadLen * 4 + 1024 ) ) * sizeof ( char ) );
	readBuffer2 = ( char * ) ckalloc ( ( maxAIOSize + ( maxReadLen * 4 + 1024 ) ) * sizeof ( char ) );
	cach1 = ( char * ) ckalloc ( ( maxReadLen * 4 + 1024 ) * sizeof ( char ) );
	cach2 = ( char * ) ckalloc ( ( maxReadLen * 4 + 1024 ) * sizeof ( char ) );
	memset ( cach1, '\0', ( maxReadLen * 4 + 1024 ) );
	memset ( cach2, '\0', ( maxReadLen * 4 + 1024 ) );
	seqBuffer = ( char ** ) ckalloc ( maxReadNum * sizeof ( char * ) );
	lenBuffer = ( int * ) ckalloc ( maxReadNum * sizeof ( int ) );
	indexArray = ( int * ) ckalloc ( maxReadNum * sizeof ( int ) );

	for ( i = 0; i < maxReadNum; i++ )
	{
		seqBuffer[i] = ( char * ) ckalloc ( maxReadLen * sizeof ( char ) );
	}

	rcSeq = ( char ** ) ckalloc ( ( thrd_num + 1 ) * sizeof ( char * ) );

	if ( 1 )
	{
		/* kmerCounter[0] counts kmers seen by the reader, kmerCounter[t + 1]
		 * is the per-worker counter summed up in the final report. */
		kmerCounter = ( long long * ) ckalloc ( ( thrd_num + 1 ) * sizeof ( long long ) );
		KmerSets = ( KmerSet ** ) ckalloc ( thrd_num * sizeof ( KmerSet * ) );
		ubyte8 init_size = 1024;
		ubyte8 k = 0;

		if ( initKmerSetSize )
		{
			/* NOTE(review): presumably initKmerSetSize is in GB and 40 / 24 is
			 * the per-node size in bytes -- the original author was unsure too. */
#ifdef MER127
			init_size = ( ubyte8 ) ( ( double ) initKmerSetSize * 1024.0f * 1024.0f * 1024.0f / ( double ) thrd_num / 40 );
#else
			init_size = ( ubyte8 ) ( ( double ) initKmerSetSize * 1024.0f * 1024.0f * 1024.0f / ( double ) thrd_num / 24 );
#endif

			/* Smallest k >= 1 with k * 0xFFFFFF >= init_size. */
			do
			{
				++k;
			}
			while ( k * 0xFFFFFFLLU < init_size );
		}

		for ( i = 0; i < thrd_num; i++ )
		{
			KmerSets[i] = init_kmerset ( ( ( initKmerSetSize ) ? ( k * 0xFFFFFFLLU ) : ( init_size ) ), 0.77f );
			thrdSignal[i + 1] = 0;
			paras[i].threadID = i;
			paras[i].mainSignal = &thrdSignal[0];
			paras[i].selfSignal = &thrdSignal[i + 1];
			kmerCounter[i + 1] = 0;
			rcSeq[i + 1] = ( char * ) ckalloc ( maxReadLen * sizeof ( char ) );
		}

		creatThrds ( threads, paras );
	}

	thrdSignal[0] = kmerCounter[0] = 0;
	time ( &start_t );
	kmer_c = n_solexa = read_c = i = libNo = readNumBack = gradsCounter = 0;

	while ( openNextFile ( &libNo, pairs, asm_ctg ) )
	{
		//read bam file
		if ( lib_array[libNo].curr_type == 4 )
		{
			int type = 0;   //decides whether the PE read is good or bad

			while ( ( flag = read1seqInLibBam ( seqBuffer[read_c], next_name, & ( lenBuffer[read_c] ), &libNo, pairs, 1, &type ) ) != 0 )
			{
				if ( type == -1 ) //if the read is bad, go back.
				{
					i--;

					/* Undo the previously buffered read. The read_c > 0 guard
					 * prevents indexing lenBuffer[-1] when the batch is empty
					 * (e.g. right after a flush). */
					if ( read_c > 0 && lenBuffer[read_c - 1] >= overlaplen + 1 )
					{
						kmer_c -= lenBuffer[read_c - 1] - overlaplen + 1;
						read_c--;
					}

					n_solexa -= 2;
					continue;
				}

				if ( ( ++i ) % 100000000 == 0 )
					{ fprintf ( stderr, "--- %lldth reads.\n", i ); }

				if ( lenBuffer[read_c] < 0 )
					{ fprintf ( stderr, "Read len %d.\n", lenBuffer[read_c] ); }

				/* Reads too short to yield a (overlaplen + 1)-mer are skipped. */
				if ( lenBuffer[read_c] < overlaplen + 1 )
					{ continue; }

				indexArray[read_c] = kmer_c;
				kmer_c += lenBuffer[read_c] - overlaplen + 1;
				read_c++;

				if ( read_c == maxReadNum )
				{
					kmerCounter[0] += kmer_c;
					sendWorkSignal ( 2, thrdSignal ); //chopKmer4read
					sendWorkSignal ( 1, thrdSignal ); //singleKmer
					kmer_c = read_c = 0;
				}
			}
		}
		//read PE fasta or fastq
		else if ( lib_array[libNo].curr_type == 1 || lib_array[libNo].curr_type == 2 )
		{
			initAIO ( &aio1, aioBuffer1, fileno ( lib_array[libNo].fp1 ), maxAIOSize );
			initAIO ( &aio2, aioBuffer2, fileno ( lib_array[libNo].fp2 ), maxAIOSize );
			int offset1, offset2, flag1, flag2, rt1, rt2;
			offset1 = offset2 = 0;
			rt1 = aio_read ( &aio1 );
			rt2 = aio_read ( &aio2 );
			flag1 = AIORead ( &aio1, &offset1, readBuffer1, cach1, &rt1, lib_array[libNo].curr_type );
			flag2 = AIORead ( &aio2, &offset2, readBuffer2, cach2, &rt2, lib_array[libNo].curr_type );

			if ( flag1 && flag2 )
			{
				int start1, start2, turn;
				start1 = start2 = 0;
				/* Reads are taken alternately from file 1 and file 2. */
				turn = 1;

				while ( start1 < offset1 || start2 < offset2 )
				{
					if ( turn == 1 )
					{
						turn = 2;
						readseqInLib ( seqBuffer[read_c], next_name, & ( lenBuffer[read_c] ), readBuffer1, &start1, offset1, libNo );

						if ( ( ++i ) % 100000000 == 0 )
							{ fprintf ( stderr, "--- %lldth reads.\n", i ); }

						if ( lenBuffer[read_c] < 0 )
							{ fprintf ( stderr, "Read len %d.\n", lenBuffer[read_c] ); }

						if ( lenBuffer[read_c] < overlaplen + 1 )
						{
							/* Chunk exhausted: fetch the next one before skipping. */
							if ( start1 >= offset1 )
							{
								start1 = 0;
								offset1 = 0;
								flag1 = AIORead ( &aio1, &offset1, readBuffer1, cach1, &rt1, lib_array[libNo].curr_type );
							}

							continue;
						}

						indexArray[read_c] = kmer_c;
						kmer_c += lenBuffer[read_c] - overlaplen + 1;
						read_c++;

						if ( start1 >= offset1 )
						{
							start1 = 0;
							offset1 = 0;
							flag1 = AIORead ( &aio1, &offset1, readBuffer1, cach1, &rt1, lib_array[libNo].curr_type );
						}

						if ( read_c == maxReadNum )
						{
							kmerCounter[0] += kmer_c;
							sendWorkSignal ( 2, thrdSignal );   //chopKmer4read
							sendWorkSignal ( 1, thrdSignal );   //singleKmer
							kmer_c = read_c = 0;
						}

						continue;
					}

					if ( turn == 2 )
					{
						turn = 1;
						readseqInLib ( seqBuffer[read_c], next_name, & ( lenBuffer[read_c] ), readBuffer2, &start2, offset2, libNo );

						if ( ( ++i ) % 100000000 == 0 )
							{ fprintf ( stderr, "--- %lldth reads.\n", i ); }

						if ( lenBuffer[read_c] < 0 )
							{ fprintf ( stderr, "Read len %d.\n", lenBuffer[read_c] ); }

						if ( lenBuffer[read_c] < overlaplen + 1 )
						{
							/* flag2 == 2 together with an exhausted chunk ends
							 * this file pair (same test as in the single-file
							 * branch, where 2 stops the read loop). */
							if ( ( flag2 == 2 ) && ( start2 >= offset2 ) )
								{ break; }

							if ( start2 >= offset2 )
							{
								start2 = 0;
								offset2 = 0;
								flag2 = AIORead ( &aio2, &offset2, readBuffer2, cach2, &rt2, lib_array[libNo].curr_type );
							}

							continue;
						}

						indexArray[read_c] = kmer_c;
						kmer_c += lenBuffer[read_c] - overlaplen + 1;
						read_c++;

						if ( ( flag2 == 2 ) && ( start2 >= offset2 ) )
							{ break; }

						if ( start2 >= offset2 )
						{
							start2 = 0;
							offset2 = 0;
							flag2 = AIORead ( &aio2, &offset2, readBuffer2, cach2, &rt2, lib_array[libNo].curr_type );
						}

						if ( read_c == maxReadNum )
						{
							kmerCounter[0] += kmer_c;
							sendWorkSignal ( 2, thrdSignal );   //chopKmer4read
							sendWorkSignal ( 1, thrdSignal );   //singleKmer
							kmer_c = read_c = 0;
						}

						continue;
					}
				}
			}
			else
			{
				fprintf(stderr, "Error: aio_read error.\n");
			}
		}
		//read single fasta, single fastq and PE fasta in one file
		else
		{
			initAIO ( &aio1, aioBuffer1, fileno ( lib_array[libNo].fp1 ), maxAIOSize );
			int offset, flag1, rt;
			offset = 0;
			rt = aio_read ( &aio1 );

			while ( ( flag1 = AIORead ( &aio1, &offset, readBuffer1, cach1, &rt, lib_array[libNo].curr_type ) ) )
			{
				int start = 0;

				while ( start < offset )
				{
					readseqInLib ( seqBuffer[read_c], next_name, & ( lenBuffer[read_c] ), readBuffer1, &start, offset, libNo );

					if ( ( ++i ) % 100000000 == 0 )
						{ fprintf ( stderr, "--- %lldth reads.\n", i ); }

					if ( lenBuffer[read_c] < 0 )
						{ fprintf ( stderr, "Read len %d.\n", lenBuffer[read_c] ); }

					if ( lenBuffer[read_c] < overlaplen + 1 )
						{ continue; }

					indexArray[read_c] = kmer_c;
					kmer_c += lenBuffer[read_c] - overlaplen + 1;
					read_c++;

					/* Hard capacity guard: the per-chunk check below leaves a
					 * margin of 1024 reads, but a chunk of very short reads can
					 * hold more than that. Flush before the next read would be
					 * written past the end of seqBuffer / lenBuffer / indexArray. */
					if ( read_c == maxReadNum )
					{
						kmerCounter[0] += kmer_c;
						sendWorkSignal ( 2, thrdSignal );   //chopKmer4read
						sendWorkSignal ( 1, thrdSignal );   //singleKmer
						kmer_c = read_c = 0;
					}
				}

				/* Flush early when fewer than 1024 free slots remain. */
				if ( read_c > maxReadNum - 1024 )
				{
					kmerCounter[0] += kmer_c;
					sendWorkSignal ( 2, thrdSignal );   //chopKmer4read
					sendWorkSignal ( 1, thrdSignal );   //singleKmer
					kmer_c = read_c = 0;
				}

				if ( flag1 == 2 )
					{ break; }
			}
		}
	}

	/* Process whatever is left in the last, partially filled batch. */
	if ( read_c )
	{
		kmerCounter[0] += kmer_c;
		sendWorkSignal ( 2, thrdSignal );   //chopKmer4read
		sendWorkSignal ( 1, thrdSignal );   //singleKmer
	}

	time ( &stop_t );
	fprintf ( stderr, "Time spent on hashing reads: %ds, %lld read(s) processed.\n", ( int ) ( stop_t - start_t ), i );

	//record insert size info
	if ( pairs )
	{
		if ( gradsCounter )
			{ fprintf ( stderr, "%d pe insert size, the largest boundary is %lld.\n\n", gradsCounter, pes[gradsCounter - 1].PE_bound ); }
		else
		{
			fprintf ( stderr, "No paired reads found.\n" );
		}

		/* Bounded formatting: outfile is caller supplied and may be long. */
		snprintf ( name, sizeof ( name ), "%s.peGrads", outfile );
		fo = ckopen ( name, "w" );
		fprintf ( fo, "grads&num: %d\t%lld\n", gradsCounter, n_solexa );

		for ( i = 0; i < gradsCounter; i++ )
		{
			fprintf ( fo, "%d\t%lld\t%d\n", pes[i].insertS, pes[i].PE_bound, pes[i].rank );
		}

		fclose ( fo );
	}

	free_pe_mem ();
	free_libs ();

	if ( 1 )
	{
		unsigned long long alloCounter = 0;
		unsigned long long allKmerCounter = 0;

		for ( i = 0; i < thrd_num; i++ )
		{
			alloCounter += count_kmerset ( ( KmerSets[i] ) );
			allKmerCounter += kmerCounter[i + 1];
			free ( ( void * ) rcSeq[i + 1] );
		}

		/* %llu for the two unsigned counters, %lli for the signed kmerCounter[0]. */
		fprintf ( stderr, "%llu node(s) allocated, %lli kmer(s) in reads, %llu kmer(s) processed.\n", alloCounter, kmerCounter[0], allKmerCounter );
	}

	free ( ( void * ) rcSeq );
	free ( ( void * ) kmerCounter );

	for ( i = 0; i < maxReadNum; i++ )
	{
		free ( ( void * ) seqBuffer[i] );
	}

	free ( ( void * ) seqBuffer );
	free ( ( void * ) lenBuffer );
	free ( ( void * ) indexArray );
	free ( ( void * ) kmerBuffer );
	free ( ( void * ) hashBanBuffer );
	free ( ( void * ) nextcBuffer );
	free ( ( void * ) prevcBuffer );
	free ( ( void * ) next_name );
	free ( ( void * ) aioBuffer1 );
	free ( ( void * ) aioBuffer2 );
	free ( ( void * ) readBuffer1 );
	free ( ( void * ) readBuffer2 );
	free ( ( void * ) cach1 );
	free ( ( void * ) cach2 );
	fprintf ( stderr, "done hashing nodes\n" );

	if ( deLowKmer )
	{
		time ( &start_t );
		deLowCov ( thrdSignal );
		time ( &stop_t );
		fprintf ( stderr, "Time spent on delowcvgNode: %ds.\n", ( int ) ( stop_t - start_t ) );
	}

	time ( &start_t );
	Mark1in1outNode ( thrdSignal );
	freqStat ( outfile );
	time ( &stop_t );
	fprintf ( stderr, "Time spent on marking linear nodes: %ds.\n", ( int ) ( stop_t - start_t ) );
	sendWorkSignal ( 3, thrdSignal );   //exit
	thread_wait ( threads );
	return 1;
}
Example #3
0
void prlRead2edge (char *libfile, char *outfile)
{
	char *cach1;
	char *cach2;
	unsigned char asm_ctg = 1;

	long long i;
	char name[256], *src_name, *next_name;
	FILE *outfp = NULL;
	int maxReadNum, libNo;
	boolean flag, pairs = 0;
	pthread_t threads[thrd_num];
	unsigned char thrdSignal[thrd_num + 1];
	PARAMETER paras[thrd_num];

	maxReadLen = 0;
	maxNameLen = 256;
	scan_libInfo (libfile);
	alloc_pe_mem (num_libs);

	if (!maxReadLen)
	{
		maxReadLen = 100;
	}

	maxReadLen4all = maxReadLen;
	printf ("In file: %s, max seq len %d, max name len %d\n\n", libfile, maxReadLen, maxNameLen);

	if (repsTie)
	{
		sprintf (name, "%s.path", outfile);
		outfp = ckopen (name, "wb");
	}

	src_name = (char *) ckalloc ((maxNameLen + 1) * sizeof (char));
	next_name = (char *) ckalloc ((10*maxNameLen + 1) * sizeof (char));
	kmerBuffer = (Kmer *) ckalloc (buffer_size * sizeof (Kmer));
	mixBuffer = (Kmer *) ckalloc (buffer_size * sizeof (Kmer));
	hashBanBuffer = (ubyte8 *) ckalloc (buffer_size * sizeof (ubyte8));
	nodeBuffer = (kmer_t **) ckalloc (buffer_size * sizeof (kmer_t *));
	smallerBuffer = (boolean *) ckalloc (buffer_size * sizeof (boolean));
	flagArray = (boolean *) ckalloc (buffer_size * sizeof (boolean));
	maxReadNum = buffer_size / (maxReadLen - overlaplen + 1);
	//printf("buffer for at most %d reads\n",maxReadNum);
	
	int maxAIOSize = 32768;/*
	aioBuffer1 = (char *) ckalloc ((maxAIOSize) * sizeof (char));
	aioBuffer2 = (char *) ckalloc ((maxAIOSize) * sizeof (char));
	readBuffer1 = (char *) ckalloc ((maxAIOSize + 1024) * sizeof (char));	//(char *)ckalloc(maxAIOSize*sizeof(char));
	readBuffer2 = (char *) ckalloc ((maxAIOSize + 1024) * sizeof (char));
	cach1 = (char *) ckalloc (1024 * sizeof (char));
	cach2 = (char *) ckalloc (1024 * sizeof (char));
	memset(cach1,'\0',1024);
	memset(cach2,'\0',1024);*/
        aioBuffer1 = (char *) ckalloc ((maxAIOSize) * sizeof (char));
        aioBuffer2 = (char *) ckalloc ((maxAIOSize) * sizeof (char));
        readBuffer1 = (char *) ckalloc ((maxAIOSize + (maxReadLen+1024)) * sizeof (char));      //(char *)ckalloc(maxAIOSize*sizeof(char));     //1024
        readBuffer2 = (char *) ckalloc ((maxAIOSize + (maxReadLen+1024)) * sizeof (char));      //1024
        cach1 = (char *) ckalloc ((maxReadLen+1024) * sizeof (char));   //1024
        cach2 = (char *) ckalloc ((maxReadLen+1024) * sizeof (char));   //1024
        memset(cach1,'\0',(maxReadLen+1024));   //1024
        memset(cach2,'\0',(maxReadLen+1024));   //1024


	seqBuffer = (char **) ckalloc (maxReadNum * sizeof (char *));
	lenBuffer = (int *) ckalloc (maxReadNum * sizeof (int));
	indexArray = (int *) ckalloc ((maxReadNum + 1) * sizeof (int));

	for (i = 0; i < maxReadNum; i++)
	{
		seqBuffer[i] = (char *) ckalloc (maxReadLen * sizeof (char));
	}

	memoAlloc4preArc ();
	flags = (char **) ckalloc ((thrd_num + 1) * sizeof (char *));
	deletion = (int *) ckalloc ((thrd_num + 1) * sizeof (int));
	rcSeq = (char **) ckalloc ((thrd_num + 1) * sizeof (char *));

	if (repsTie)
	{
		markerOnEdge = (unsigned char *) ckalloc ((num_ed + 1) * sizeof (unsigned char));

		for (i = 1; i <= num_ed; i++)
		{
			markerOnEdge[i] = 0;
		}

		fwriteBuf = (unsigned int *) ckalloc ((maxReadLen - overlaplen + 1) * sizeof (unsigned int));
	}

	thrdSignal[0] = 0;

	if (1)
	{
		preArc_mem_managers = (MEM_MANAGER **) ckalloc (thrd_num * sizeof (MEM_MANAGER *));
		arcCounters = (unsigned int *) ckalloc (thrd_num * sizeof (unsigned int));

		for (i = 0; i < thrd_num; i++)
		{
			arcCounters[i] = 0;
			preArc_mem_managers[i] = createMem_manager (preARCBLOCKSIZE, sizeof (preARC));
			deletion[i + 1] = 0;
			flags[i + 1] = (char *) ckalloc (2 * maxReadLen * sizeof (char));
			rcSeq[i + 1] = (char *) ckalloc (maxReadLen * sizeof (char));
			thrdSignal[i + 1] = 0;
			paras[i].threadID = i;
			paras[i].mainSignal = &thrdSignal[0];
			paras[i].selfSignal = &thrdSignal[i + 1];
		}

		creatThrds (threads, paras);
	}

	if (1)
	{
		deletion[0] = 0;
		flags[0] = (char *) ckalloc (2 * maxReadLen * sizeof (char));
		rcSeq[0] = (char *) ckalloc (maxReadLen * sizeof (char));
	}

	kmer_c = n_solexa = read_c = i = libNo = readNumBack = gradsCounter = 0;
	int t0, t1, t2, t3, t4, t5, t6;

	t0 = t1 = t2 = t3 = t4 = t5 = t6 = 0;
	time_t read_start, read_end, time_bef, time_aft;

	time (&read_start);
	
		while (openNextFile (&libNo, pairs, asm_ctg))
	{
		if (lib_array[libNo].curr_type == 4)
		{
			int type = 0;	//deside the PE reads is good or bad

			while ((flag = read1seqInLibBam (seqBuffer[read_c], next_name, &(lenBuffer[read_c]), &libNo, pairs, 1, &type)) != 0)
			{
				if (type == -1)	//if the reads is bad, go back.
				{
					i--;
					if (lenBuffer[read_c - 1] >= overlaplen + 1)
					{
						kmer_c -= lenBuffer[read_c - 1] - overlaplen + 1;
						read_c--;
					}
					n_solexa -= 2;
					continue;
				}
				if ((++i) % 1000000 == 0)
				{
					printf ("--- %lldth reads\n", i);
				}

				if (lenBuffer[read_c] < overlaplen + 1)
				{
					continue;
				}

				//if(lenBuffer[read_c]>70)
				//    lenBuffer[read_c] = 70;
				//else if(lenBuffer[read_c]>40)
				//    lenBuffer[read_c] = 40;

				indexArray[read_c] = kmer_c;
				kmer_c += lenBuffer[read_c] - overlaplen + 1;
				read_c++;

				if (read_c == maxReadNum)
				{
					indexArray[read_c] = kmer_c;
					time (&read_end);
					t0 += read_end - read_start;
					time (&time_bef);
					sendWorkSignal (2, thrdSignal);
					time (&time_aft);
					t1 += time_aft - time_bef;
					time (&time_bef);
					sendWorkSignal (1, thrdSignal);
					time (&time_aft);
					t2 += time_aft - time_bef;
					time (&time_bef);
					sendWorkSignal (3, thrdSignal);
					time (&time_aft);
					t3 += time_aft - time_bef;
					time (&time_bef);
					sendWorkSignal (4, thrdSignal);
					time (&time_aft);
					t4 += time_aft - time_bef;
					time (&time_bef);
					sendWorkSignal (6, thrdSignal);
					time (&time_aft);
					t5 += time_aft - time_bef;
					time (&time_bef);

					//recordPreArc();
					if (repsTie)
					{
						recordPathBin (outfp);
					}

					time (&time_aft);
					t6 += time_aft - time_bef;
					//output_path(read_c,edge_no,flags,outfp);
					kmer_c = 0;
					read_c = 0;
					time (&read_start);
				}
			}
		}
		else if (lib_array[libNo].curr_type == 1 || lib_array[libNo].curr_type == 2)
		{
			initAIO (&aio1, aioBuffer1, fileno (lib_array[libNo].fp1), maxAIOSize);
			initAIO (&aio2, aioBuffer2, fileno (lib_array[libNo].fp2), maxAIOSize);
			int offset1, offset2, flag1, flag2, rt1, rt2;

			offset1 = offset2 = 0;
			rt1 = aio_read (&aio1);
			rt2 = aio_read (&aio2);
			flag1 = AIORead (&aio1, &offset1, readBuffer1, cach1, &rt1, lib_array[libNo].curr_type);
			flag2 = AIORead (&aio2, &offset2, readBuffer2, cach2, &rt2, lib_array[libNo].curr_type);
			if(flag1 && flag2)
			{
				int start1, start2, turn;

				start1 = start2 = 0;
				turn = 1;
				while (start1 < offset1 || start2 < offset2)
				{
					if (turn == 1)
					{
						turn = 2;
						readseqInLib (seqBuffer[read_c], next_name, &(lenBuffer[read_c]), readBuffer1, &start1, offset1, libNo);
						if ((++i) % 1000000 == 0)
							printf ("--- %lldth reads\n", i);
/*						if (lenBuffer[read_c] < overlaplen + 1)
							continue;*/
						if (lenBuffer[read_c] < overlaplen + 1)
						{
							if(start1>=offset1)
							{
								start1=0;
								flag1=AIORead (&aio1, &offset1, readBuffer1, cach1, &rt1, lib_array[libNo].curr_type);
							}
							continue;
						}

						indexArray[read_c] = kmer_c;
						kmer_c += lenBuffer[read_c] - overlaplen + 1;
						read_c++;
						if(start1>=offset1){
							start1=0;
							flag1=AIORead (&aio1, &offset1, readBuffer1, cach1, &rt1, lib_array[libNo].curr_type);
						}
						if (read_c == maxReadNum) {
							indexArray[read_c] = kmer_c;

							time (&read_end);
							t0 += read_end - read_start;
							time (&time_bef);
							sendWorkSignal (2, thrdSignal);
							time (&time_aft);
							t1 += time_aft - time_bef;
							time (&time_bef);
							sendWorkSignal (1, thrdSignal);
							time (&time_aft);
							t2 += time_aft - time_bef;
							time (&time_bef);
							sendWorkSignal (3, thrdSignal);
							time (&time_aft);
							t3 += time_aft - time_bef;
							time (&time_bef);
							sendWorkSignal (4, thrdSignal);
							time (&time_aft);
							t4 += time_aft - time_bef;
							time (&time_bef);
							sendWorkSignal (6, thrdSignal);
							time (&time_aft);
							t5 += time_aft - time_bef;
							time (&time_bef);

							//recordPreArc();
							if (repsTie)
								recordPathBin (outfp);
							time (&time_aft);
							t6 += time_aft - time_bef;
							//output_path(read_c,edge_no,flags,outfp);
							kmer_c = 0;
							read_c = 0;
							time (&read_start);
						}
						continue;
					}
					if (turn == 2)
					{
						turn = 1;
						readseqInLib (seqBuffer[read_c], next_name, &(lenBuffer[read_c]), readBuffer2, &start2, offset2, libNo);
						if ((++i) % 1000000 == 0)
							printf ("--- %lldth reads\n", i);
/*						if (lenBuffer[read_c] < overlaplen + 1)
							continue;*/
						if (lenBuffer[read_c] < overlaplen + 1)
						{
							if((flag2 == 2) && (start2 >= offset2))
								break;

							if(start2 >= offset2)
							{
								start2=0;
								flag2 = AIORead (&aio2, &offset2, readBuffer2, cach2, &rt2, lib_array[libNo].curr_type);
							}
							continue;
						}

						indexArray[read_c] = kmer_c;
						kmer_c += lenBuffer[read_c] - overlaplen + 1;
						read_c++;
						if((flag2 == 2) && (start2 >= offset2))
							break;
						if(start2 >= offset2){
			                        	start2=0;
							flag2 = AIORead (&aio2, &offset2, readBuffer2, cach2, &rt2, lib_array[libNo].curr_type);
						}
						if (read_c == maxReadNum){
							indexArray[read_c] = kmer_c;

							time (&read_end);
							t0 += read_end - read_start;
							time (&time_bef);
							sendWorkSignal (2, thrdSignal);
							time (&time_aft);
							t1 += time_aft - time_bef;
							time (&time_bef);
							sendWorkSignal (1, thrdSignal);
							time (&time_aft);
							t2 += time_aft - time_bef;
							time (&time_bef);
							sendWorkSignal (3, thrdSignal);
							time (&time_aft);
							t3 += time_aft - time_bef;
							time (&time_bef);
							sendWorkSignal (4, thrdSignal);
							time (&time_aft);
							t4 += time_aft - time_bef;
							time (&time_bef);
							sendWorkSignal (6, thrdSignal);
							time (&time_aft);
							t5 += time_aft - time_bef;
							time (&time_bef);

							//recordPreArc();
							if (repsTie)
								recordPathBin (outfp);
							time (&time_aft);
							t6 += time_aft - time_bef;
							//output_path(read_c,edge_no,flags,outfp);
							kmer_c = 0;
							read_c = 0;
							time (&read_start);
						}
						continue;
					}
				}
			}
		}
		else
		{
			initAIO (&aio1, aioBuffer1, fileno (lib_array[libNo].fp1), maxAIOSize);
			int offset, flag1, rt;

			offset = 0;
			rt = aio_read (&aio1);
			while ((flag1 = AIORead (&aio1, &offset, readBuffer1, cach1, &rt, lib_array[libNo].curr_type)))
			{
				int start = 0;

				while (start < offset)
				{
					readseqInLib (seqBuffer[read_c], next_name, &(lenBuffer[read_c]), readBuffer1, &start, offset, libNo);
					if ((++i) % 1000000 == 0)
						printf ("--- %lld reads\n", i);
					if (lenBuffer[read_c] < overlaplen + 1)
						continue;
					indexArray[read_c] = kmer_c;
					kmer_c += lenBuffer[read_c] - overlaplen + 1;
					read_c++;

					if (read_c > maxReadNum - 1024)
					{
						indexArray[read_c] = kmer_c;

						time (&read_end);
						t0 += read_end - read_start;
						time (&time_bef);
						sendWorkSignal (2, thrdSignal);
						time (&time_aft);
						t1 += time_aft - time_bef;
						time (&time_bef);
						sendWorkSignal (1, thrdSignal);
						time (&time_aft);
						t2 += time_aft - time_bef;
						time (&time_bef);
						sendWorkSignal (3, thrdSignal);
						time (&time_aft);
						t3 += time_aft - time_bef;
						time (&time_bef);
						sendWorkSignal (4, thrdSignal);
						time (&time_aft);
						t4 += time_aft - time_bef;
						time (&time_bef);
						sendWorkSignal (6, thrdSignal);
						time (&time_aft);
						t5 += time_aft - time_bef;
						time (&time_bef);

						//recordPreArc();
						if (repsTie)
							recordPathBin (outfp);
						time (&time_aft);
						t6 += time_aft - time_bef;
						//output_path(read_c,edge_no,flags,outfp);
						kmer_c = 0;
						read_c = 0;
						time (&read_start);
					}
				}
				if (flag1 == 2)
					break;

			}
		}
	}

	printf ("%lld reads processed\n", i);
	printf ("time %d,%d,%d,%d,%d,%d,%d\n", t0, t1, t2, t3, t4, t5, t6);

	if (read_c)
	{
		indexArray[read_c] = kmer_c;
		sendWorkSignal (2, thrdSignal);
		sendWorkSignal (1, thrdSignal);
		sendWorkSignal (3, thrdSignal);
		sendWorkSignal (4, thrdSignal);
		sendWorkSignal (6, thrdSignal);

		//recordPreArc();
		if (repsTie)
		{
			recordPathBin (outfp);
		}
	}

	printf ("%lld markers outputed\n", markCounter);
	sendWorkSignal (5, thrdSignal);
	thread_wait (threads);
	output_arcs (outfile);
	memoFree4preArc ();

	if (1)			// multi-threads
	{
		arcCounter = 0;

		for (i = 0; i < thrd_num; i++)
		{
			arcCounter += arcCounters[i];
			free ((void *) flags[i + 1]);
			deletion[0] += deletion[i + 1];
			free ((void *) rcSeq[i + 1]);
		}
	}

	if (1)
	{
		free ((void *) flags[0]);
		free ((void *) rcSeq[0]);
	}

	printf ("done mapping reads, %d reads deleted, %lld arcs created\n", deletion[0], arcCounter);

	if (repsTie)
	{
		free ((void *) markerOnEdge);
		free ((void *) fwriteBuf);
	}

	free ((void *) arcCounters);
	free ((void *) rcSeq);

	for (i = 0; i < maxReadNum; i++)
	{
		free ((void *) seqBuffer[i]);
	}

	free ((void *) seqBuffer);
	free ((void *) lenBuffer);
	free ((void *) indexArray);
	free ((void *) flags);
	free ((void *) deletion);
	free ((void *) kmerBuffer);
	free ((void *) mixBuffer);
	free ((void *) smallerBuffer);
	free ((void *) flagArray);
	free ((void *) hashBanBuffer);
	free ((void *) nodeBuffer);
	free ((void *) src_name);
	free ((void *) next_name);
	free ((void *) aioBuffer1);
    free ((void *) aioBuffer2);
    free ((void *) readBuffer1);
    free ((void *) readBuffer2);
    free ((void *) cach1);
    free ((void *) cach2);

	if (repsTie)
	{
		fclose (outfp);
	}

	free_pe_mem ();
	free_libs ();
}
Example #4
0
/*
 * Read every read of the libraries listed in `libfile`, chop the reads into
 * kmers and store them in the per-thread KmerSets; afterwards optionally
 * remove low-coverage kmers (deLowKmer), mark linear nodes and call
 * freqStat (outfile).
 *
 *   libfile  the reads config file
 *   outfile  the output file prefix
 *
 * Progress and timings are printed to stdout. Always returns 1.
 */
boolean prlRead2HashTable ( char * libfile, char * outfile )
{
	long long i;
	char * next_name, name[256];
	FILE * fo;
	time_t start_t, stop_t;
	int maxReadNum;
	int libNo;
	pthread_t threads[thrd_num];
	/* thrdSignal[0] is the main signal, thrdSignal[t + 1] belongs to worker t. */
	unsigned char thrdSignal[thrd_num + 1];
	PARAMETER paras[thrd_num];
	/* `pairs` is never changed below, so the insert-size report further down
	 * is currently not executed by this function. */
	boolean flag, pairs = 0;
	WORDFILTER = createFilter ( overlaplen );
	maxReadLen = 0;
	maxNameLen = 256;
	scan_libInfo ( libfile );
	alloc_pe_mem ( num_libs );

	if ( !maxReadLen )
		{ maxReadLen = 100; }

	maxReadLen4all = maxReadLen;
	printf ( "In %s, %d libs, max seq len %d, max name len %d\n\n",
	         libfile, num_libs, maxReadLen, maxNameLen );
	next_name = ( char * ) ckalloc ( ( maxNameLen + 1 ) * sizeof ( char ) );
	kmerBuffer = ( Kmer * ) ckalloc ( buffer_size * sizeof ( Kmer ) );
	hashBanBuffer = ( ubyte8 * ) ckalloc ( buffer_size * sizeof ( ubyte8 ) );
	prevcBuffer = ( char * ) ckalloc ( buffer_size * sizeof ( char ) );
	nextcBuffer = ( char * ) ckalloc ( buffer_size * sizeof ( char ) );
	/* One batch holds at most maxReadNum reads, so that the kmers of a full
	 * batch of maximum-length reads fit into the buffer_size kmer buffers.
	 * NOTE(review): assumes maxReadLen >= overlaplen; otherwise the divisor
	 * is zero or negative -- confirm that scan_libInfo guarantees this. */
	maxReadNum = buffer_size / ( maxReadLen - overlaplen + 1 );
	seqBuffer = ( char ** ) ckalloc ( maxReadNum * sizeof ( char * ) );
	lenBuffer = ( int * ) ckalloc ( maxReadNum * sizeof ( int ) );
	indexArray = ( int * ) ckalloc ( maxReadNum * sizeof ( int ) );

	for ( i = 0; i < maxReadNum; i++ )
		{ seqBuffer[i] = ( char * ) ckalloc ( maxReadLen * sizeof ( char ) ); }

	rcSeq = ( char ** ) ckalloc ( ( thrd_num + 1 ) * sizeof ( char * ) );

	if ( 1 )
	{
		/* kmerCounter[0] counts kmers seen by the reader, kmerCounter[t + 1]
		 * is the per-worker counter summed up in the final report. */
		kmerCounter = ( long long * ) ckalloc ( ( thrd_num + 1 ) * sizeof ( long long ) );
		KmerSets = ( KmerSet ** ) ckalloc ( thrd_num * sizeof ( KmerSet * ) );
		ubyte8  init_size = 1024;
		ubyte8 k = 0;

		if ( initKmerSetSize )
		{
			/* NOTE(review): presumably initKmerSetSize is in GB and 32 is the
			 * per-node size in bytes -- confirm. */
			init_size = ( ubyte8 ) ( ( double ) initKmerSetSize * 1024.0f * 1024.0f * 1024.0f / ( double ) thrd_num / 32 );

			/* Smallest k >= 1 with k * 0xFFFFFF >= init_size. */
			do
			{
				++k;
			}
			while ( k * 0xFFFFFFLLU < init_size );
		}

		for ( i = 0; i < thrd_num; i++ )
		{
			/* Without a user-supplied size k stays 0; fall back to the
			 * default init_size (1024) instead of requesting a 0-sized set. */
			KmerSets[i] = init_kmerset ( ( ( initKmerSetSize ) ? ( k * 0xFFFFFFLLU ) : ( init_size ) ), 0.77f );
			thrdSignal[i + 1] = 0;
			paras[i].threadID = i;
			paras[i].mainSignal = &thrdSignal[0];
			paras[i].selfSignal = &thrdSignal[i + 1];
			kmerCounter[i + 1] = 0;
			rcSeq[i + 1] = ( char * ) ckalloc ( maxReadLen * sizeof ( char ) );
		}

		creatThrds ( threads, paras );
	}

	thrdSignal[0] = kmerCounter[0] = 0;
	time ( &start_t );
	kmer_c = n_solexa = read_c = i = libNo = readNumBack = gradsCounter = 0;

	while ( ( flag = read1seqInLib ( seqBuffer[read_c], next_name, & ( lenBuffer[read_c] ), &libNo, pairs, 1 ) ) != 0 )
	{
		if ( ( ++i ) % 100000000 == 0 )
			{ printf ( "--- %lldth reads\n", i ); }

		if ( lenBuffer[read_c] < 0 )
			{ printf ( "read len %d\n", lenBuffer[read_c] ); }

		/* Reads too short to yield a (overlaplen + 1)-mer are skipped. */
		if ( lenBuffer[read_c] < overlaplen + 1 )
			{ continue; }

		indexArray[read_c] = kmer_c;
		kmer_c += lenBuffer[read_c] - overlaplen + 1;
		read_c++;

		/* Batch full: let the workers process it, then start a new one. */
		if ( read_c == maxReadNum )
		{
			kmerCounter[0] += kmer_c;
			sendWorkSignal ( 2, thrdSignal );
			sendWorkSignal ( 1, thrdSignal );
			kmer_c = read_c = 0;
		}
	}

	/* Process whatever is left in the last, partially filled batch. */
	if ( read_c )
	{
		kmerCounter[0] += kmer_c;
		sendWorkSignal ( 2, thrdSignal );
		sendWorkSignal ( 1, thrdSignal );
	}

	time ( &stop_t );
	printf ( "time spent on hash reads: %ds, %lld reads processed\n", ( int ) ( stop_t - start_t ), i );

	//record insert size info
	if ( pairs )
	{
		if ( gradsCounter )
			printf ( "%d pe insert size, the largest boundary is %lld\n\n",
			         gradsCounter, pes[gradsCounter - 1].PE_bound );
		else
			{ printf ( "no paired reads found\n" ); }

		/* Bounded formatting: outfile is caller supplied and may be long. */
		snprintf ( name, sizeof ( name ), "%s.peGrads", outfile );
		fo = ckopen ( name, "w" );
		fprintf ( fo, "grads&num: %d\t%lld\n", gradsCounter, n_solexa );

		for ( i = 0; i < gradsCounter; i++ )
			{ fprintf ( fo, "%d\t%lld\t%d\n", pes[i].insertS, pes[i].PE_bound, pes[i].rank ); }

		fclose ( fo );
	}

	free_pe_mem();
	free_libs();

	if ( 1 )
	{
		unsigned long long alloCounter = 0;
		unsigned long long allKmerCounter = 0;

		for ( i = 0; i < thrd_num; i++ )
		{
			alloCounter += count_kmerset ( ( KmerSets[i] ) );
			allKmerCounter += kmerCounter[i + 1];
			free ( ( void * ) rcSeq[i + 1] );
		}

		/* %llu for the two unsigned counters, %lli for the signed kmerCounter[0]. */
		printf ( "%llu nodes allocated, %lli kmer in reads, %llu kmer processed\n"
		         , alloCounter, kmerCounter[0], allKmerCounter );
	}

	free ( ( void * ) rcSeq );
	free ( ( void * ) kmerCounter );

	for ( i = 0; i < maxReadNum; i++ )
		{ free ( ( void * ) seqBuffer[i] ); }

	free ( ( void * ) seqBuffer );
	free ( ( void * ) lenBuffer );
	free ( ( void * ) indexArray );
	free ( ( void * ) kmerBuffer );
	free ( ( void * ) hashBanBuffer );
	free ( ( void * ) nextcBuffer );
	free ( ( void * ) prevcBuffer );
	free ( ( void * ) next_name );

	if ( deLowKmer )
	{
		time ( &start_t );
		deLowCov ( thrdSignal );
		time ( &stop_t );
		printf ( "time spent on delowcvgNode %ds\n", ( int ) ( stop_t - start_t ) );
	}

	time ( &start_t );
	Mark1in1outNode ( thrdSignal );
	freqStat ( outfile );
	time ( &stop_t );
	printf ( "time spent on marking linear nodes %ds\n", ( int ) ( stop_t - start_t ) );
	fflush ( stdout );
	/* Signal 3 is sent last, right before waiting for the worker threads. */
	sendWorkSignal ( 3, thrdSignal );
	thread_wait ( threads );
	return 1;
}