/************************************************* Function: prlRead2HashTable Description: 1. Imports the reads from the lib file one by one. 2. Chops the reads into kmers and store them in KmerSets. 3. Removes the kmers with low coverage. 4. Marks the linear kmers. 5. Counts the kmer frequences. Input: 1. libfile : the reads config file 2. outfile : the output file prefix Output: None. Return: 1 if exits normally. *************************************************/ boolean prlRead2HashTable ( char * libfile, char * outfile ) { char * cach1; char * cach2; unsigned char asm_ctg = 1; long long i; char * next_name, name[256]; FILE * fo; time_t start_t, stop_t; int maxReadNum; int libNo; pthread_t threads[thrd_num]; unsigned char thrdSignal[thrd_num + 1]; PARAMETER paras[thrd_num]; boolean flag, pairs = 0; WORDFILTER = createFilter ( overlaplen ); maxReadLen = 0; maxNameLen = 256; scan_libInfo ( libfile ); alloc_pe_mem ( num_libs ); if ( !maxReadLen ) { maxReadLen = 100; } if ( gLineLen < maxReadLen ) { gStr = ( char * ) ckalloc ( ( maxReadLen + 1 ) * sizeof ( char ) ); } //init maxReadLen4all = maxReadLen; fprintf ( stderr, "In %s, %d lib(s), maximum read length %d, maximum name length %d.\n\n", libfile, num_libs, maxReadLen, maxNameLen ); next_name = ( char * ) ckalloc ( ( maxNameLen + 1 ) * sizeof ( char ) ); kmerBuffer = ( Kmer * ) ckalloc ( buffer_size * sizeof ( Kmer ) ); hashBanBuffer = ( ubyte8 * ) ckalloc ( buffer_size * sizeof ( ubyte8 ) ); prevcBuffer = ( char * ) ckalloc ( buffer_size * sizeof ( char ) ); nextcBuffer = ( char * ) ckalloc ( buffer_size * sizeof ( char ) ); maxReadNum = buffer_size / ( maxReadLen - overlaplen + 1 ); //printf("buffer size %d, max read len %d, max read num %d\n",buffer_size,maxReadLen,maxReadNum); int maxAIOSize = 32768; aioBuffer1 = ( char * ) ckalloc ( ( maxAIOSize ) * sizeof ( char ) ); aioBuffer2 = ( char * ) ckalloc ( ( maxAIOSize ) * sizeof ( char ) ); readBuffer1 = ( char * ) ckalloc ( ( maxAIOSize + ( maxReadLen * 4 + 1024 ) ) * sizeof ( char ) ); //(char *)ckalloc(maxAIOSize*sizeof(char)); //1024 readBuffer2 = ( char * ) ckalloc ( ( maxAIOSize + ( maxReadLen * 4 + 1024 ) ) * sizeof ( char ) ); //1024 cach1 = ( char * ) ckalloc ( ( maxReadLen * 4 + 1024 ) * sizeof ( char ) ); //1024 cach2 = ( char * ) ckalloc ( ( maxReadLen * 4 + 1024 ) * sizeof ( char ) ); //1024 memset ( cach1, '\0', ( maxReadLen * 4 + 1024 ) ); //1024 memset ( cach2, '\0', ( maxReadLen * 4 + 1024 ) ); //1024 seqBuffer = ( char ** ) ckalloc ( maxReadNum * sizeof ( char * ) ); lenBuffer = ( int * ) ckalloc ( maxReadNum * sizeof ( int ) ); indexArray = ( int * ) ckalloc ( maxReadNum * sizeof ( int ) ); for ( i = 0; i < maxReadNum; i++ ) { seqBuffer[i] = ( char * ) ckalloc ( maxReadLen * sizeof ( char ) ); } rcSeq = ( char ** ) ckalloc ( ( thrd_num + 1 ) * sizeof ( char * ) ); if ( 1 ) { kmerCounter = ( long long * ) ckalloc ( ( thrd_num + 1 ) * sizeof ( long long ) ); KmerSets = ( KmerSet ** ) ckalloc ( thrd_num * sizeof ( KmerSet * ) ); ubyte8 init_size = 1024; ubyte8 k = 0; if ( initKmerSetSize ) { #ifdef MER127 init_size = ( ubyte8 ) ( ( double ) initKmerSetSize * 1024.0f * 1024.0f * 1024.0f / ( double ) thrd_num / 40 ); #else init_size = ( ubyte8 ) ( ( double ) initKmerSetSize * 1024.0f * 1024.0f * 1024.0f / ( double ) thrd_num / 24 ); //is it true? #endif do { ++k; } while ( k * 0xFFFFFFLLU < init_size ); } for ( i = 0; i < thrd_num; i++ ) { //KmerSets[i] = init_kmerset(1024,0.77f); KmerSets[i] = init_kmerset ( ( ( initKmerSetSize ) ? ( k * 0xFFFFFFLLU ) : ( init_size ) ), 0.77f ); thrdSignal[i + 1] = 0; paras[i].threadID = i; paras[i].mainSignal = &thrdSignal[0]; paras[i].selfSignal = &thrdSignal[i + 1]; kmerCounter[i + 1] = 0; rcSeq[i + 1] = ( char * ) ckalloc ( maxReadLen * sizeof ( char ) ); } creatThrds ( threads, paras ); } thrdSignal[0] = kmerCounter[0] = 0; time ( &start_t ); kmer_c = n_solexa = read_c = i = libNo = readNumBack = gradsCounter = 0; while ( openNextFile ( &libNo, pairs, asm_ctg ) ) { //read bam file if ( lib_array[libNo].curr_type == 4 ) { int type = 0; //deside the PE reads is good or bad while ( ( flag = read1seqInLibBam ( seqBuffer[read_c], next_name, & ( lenBuffer[read_c] ), &libNo, pairs, 1, &type ) ) != 0 ) { if ( type == -1 ) //if the reads is bad, go back. { i--; if ( lenBuffer[read_c - 1] >= overlaplen + 1 ) { kmer_c -= lenBuffer[read_c - 1] - overlaplen + 1; read_c--; } n_solexa -= 2; continue; } if ( ( ++i ) % 100000000 == 0 ) { fprintf ( stderr, "--- %lldth reads.\n", i ); } if ( lenBuffer[read_c] < 0 ) { fprintf ( stderr, "Read len %d.\n", lenBuffer[read_c] ); } if ( lenBuffer[read_c] < overlaplen + 1 ) { continue; } /* if(lenBuffer[read_c]>70) lenBuffer[read_c] = 50; else if(lenBuffer[read_c]>40) lenBuffer[read_c] = 40; */ indexArray[read_c] = kmer_c; kmer_c += lenBuffer[read_c] - overlaplen + 1; read_c++; if ( read_c == maxReadNum ) { kmerCounter[0] += kmer_c; sendWorkSignal ( 2, thrdSignal ); //chopKmer4read sendWorkSignal ( 1, thrdSignal ); //singleKmer kmer_c = read_c = 0; } } } //read PE fasta or fastq else if ( lib_array[libNo].curr_type == 1 || lib_array[libNo].curr_type == 2 ) { initAIO ( &aio1, aioBuffer1, fileno ( lib_array[libNo].fp1 ), maxAIOSize ); initAIO ( &aio2, aioBuffer2, fileno ( lib_array[libNo].fp2 ), maxAIOSize ); int offset1, offset2, flag1, flag2, rt1, rt2; offset1 = offset2 = 0; rt1 = aio_read ( &aio1 ); rt2 = aio_read ( &aio2 ); flag1 = AIORead ( &aio1, &offset1, readBuffer1, cach1, &rt1, lib_array[libNo].curr_type ); flag2 = AIORead ( &aio2, &offset2, readBuffer2, cach2, &rt2, lib_array[libNo].curr_type ); if ( flag1 && flag2 ) { int start1, start2, turn; start1 = start2 = 0; turn = 1; while ( start1 < offset1 || start2 < offset2 ) { if ( turn == 1 ) { turn = 2; readseqInLib ( seqBuffer[read_c], next_name, & ( lenBuffer[read_c] ), readBuffer1, &start1, offset1, libNo ); if ( ( ++i ) % 100000000 == 0 ) { fprintf ( stderr, "--- %lldth reads.\n", i ); } if ( lenBuffer[read_c] < 0 ) { fprintf ( stderr, "Read len %d.\n", lenBuffer[read_c] ); } if ( lenBuffer[read_c] < overlaplen + 1 ) { if ( start1 >= offset1 ) { start1 = 0; offset1 = 0; flag1 = AIORead ( &aio1, &offset1, readBuffer1, cach1, &rt1, lib_array[libNo].curr_type ); } continue; } indexArray[read_c] = kmer_c; kmer_c += lenBuffer[read_c] - overlaplen + 1; read_c++; if ( start1 >= offset1 ) { start1 = 0; offset1 = 0; flag1 = AIORead ( &aio1, &offset1, readBuffer1, cach1, &rt1, lib_array[libNo].curr_type ); } if ( read_c == maxReadNum ) { kmerCounter[0] += kmer_c; sendWorkSignal ( 2, thrdSignal ); //chopKmer4read sendWorkSignal ( 1, thrdSignal ); //singleKmer kmer_c = read_c = 0; } continue; } if ( turn == 2 ) { turn = 1; readseqInLib ( seqBuffer[read_c], next_name, & ( lenBuffer[read_c] ), readBuffer2, &start2, offset2, libNo ); if ( ( ++i ) % 100000000 == 0 ) { fprintf ( stderr, "--- %lldth reads.\n", i ); } if ( lenBuffer[read_c] < 0 ) { fprintf ( stderr, "Read len %d.\n", lenBuffer[read_c] ); } if ( lenBuffer[read_c] < overlaplen + 1 ) { if ( ( flag2 == 2 ) && ( start2 >= offset2 ) ) { break; } if ( start2 >= offset2 ) { start2 = 0; offset2 = 0; flag2 = AIORead ( &aio2, &offset2, readBuffer2, cach2, &rt2, lib_array[libNo].curr_type ); } continue; } indexArray[read_c] = kmer_c; kmer_c += lenBuffer[read_c] - overlaplen + 1; read_c++; if ( ( flag2 == 2 ) && ( start2 >= offset2 ) ) { break; } if ( start2 >= offset2 ) { start2 = 0; offset2 = 0; flag2 = AIORead ( &aio2, &offset2, readBuffer2, cach2, &rt2, lib_array[libNo].curr_type ); } if ( read_c == maxReadNum ) { kmerCounter[0] += kmer_c; sendWorkSignal ( 2, thrdSignal ); //chopKmer4read sendWorkSignal ( 1, thrdSignal ); //singleKmer kmer_c = read_c = 0; } continue; } } } else { fprintf(stderr, "Error: aio_read error.\n"); } } //read single fasta, single fastq and PE fasta in one file else { initAIO ( &aio1, aioBuffer1, fileno ( lib_array[libNo].fp1 ), maxAIOSize ); int offset, flag1, rt; offset = 0; rt = aio_read ( &aio1 ); while ( ( flag1 = AIORead ( &aio1, &offset, readBuffer1, cach1, &rt, lib_array[libNo].curr_type ) ) ) { int start = 0; while ( start < offset ) { readseqInLib ( seqBuffer[read_c], next_name, & ( lenBuffer[read_c] ), readBuffer1, &start, offset, libNo ); if ( ( ++i ) % 100000000 == 0 ) { fprintf ( stderr, "--- %lldth reads.\n", i ); } if ( lenBuffer[read_c] < 0 ) { fprintf ( stderr, "Read len %d.\n", lenBuffer[read_c] ); } if ( lenBuffer[read_c] < overlaplen + 1 ) { continue; } indexArray[read_c] = kmer_c; kmer_c += lenBuffer[read_c] - overlaplen + 1; read_c++; } if ( read_c > maxReadNum - 1024 ) { kmerCounter[0] += kmer_c; sendWorkSignal ( 2, thrdSignal ); //chopKmer4read sendWorkSignal ( 1, thrdSignal ); //singleKmer kmer_c = read_c = 0; } if ( flag1 == 2 ) { break; } } } } if ( read_c ) { kmerCounter[0] += kmer_c; sendWorkSignal ( 2, thrdSignal ); //chopKmer4read sendWorkSignal ( 1, thrdSignal ); //singleKmer } time ( &stop_t ); fprintf ( stderr, "Time spent on hashing reads: %ds, %lld read(s) processed.\n", ( int ) ( stop_t - start_t ), i ); //record insert size info if ( pairs ) { if ( gradsCounter ) { fprintf ( stderr, "%d pe insert size, the largest boundary is %lld.\n\n", gradsCounter, pes[gradsCounter - 1].PE_bound ); } else { fprintf ( stderr, "No paired reads found.\n" ); } sprintf ( name, "%s.peGrads", outfile ); fo = ckopen ( name, "w" ); fprintf ( fo, "grads&num: %d\t%lld\n", gradsCounter, n_solexa ); for ( i = 0; i < gradsCounter; i++ ) { fprintf ( fo, "%d\t%lld\t%d\n", pes[i].insertS, pes[i].PE_bound, pes[i].rank ); } fclose ( fo ); } free_pe_mem (); free_libs (); if ( 1 ) { unsigned long long alloCounter = 0; unsigned long long allKmerCounter = 0; for ( i = 0; i < thrd_num; i++ ) { alloCounter += count_kmerset ( ( KmerSets[i] ) ); allKmerCounter += kmerCounter[i + 1]; free ( ( void * ) rcSeq[i + 1] ); } fprintf ( stderr, "%lli node(s) allocated, %lli kmer(s) in reads, %lli kmer(s) processed.\n", alloCounter, kmerCounter[0], allKmerCounter ); } free ( ( void * ) rcSeq ); free ( ( void * ) kmerCounter ); for ( i = 0; i < maxReadNum; i++ ) { free ( ( void * ) seqBuffer[i] ); } free ( ( void * ) seqBuffer ); free ( ( void * ) lenBuffer ); free ( ( void * ) indexArray ); free ( ( void * ) kmerBuffer ); free ( ( void * ) hashBanBuffer ); free ( ( void * ) nextcBuffer ); free ( ( void * ) prevcBuffer ); free ( ( void * ) next_name ); free ( ( void * ) aioBuffer1 ); free ( ( void * ) aioBuffer2 ); free ( ( void * ) readBuffer1 ); free ( ( void * ) readBuffer2 ); free ( ( void * ) cach1 ); free ( ( void * ) cach2 ); fprintf ( stderr, "done hashing nodes\n" ); if ( deLowKmer ) { time ( &start_t ); deLowCov ( thrdSignal ); time ( &stop_t ); fprintf ( stderr, "Time spent on delowcvgNode: %ds.\n", ( int ) ( stop_t - start_t ) ); } time ( &start_t ); Mark1in1outNode ( thrdSignal ); freqStat ( outfile ); time ( &stop_t ); fprintf ( stderr, "Time spent on marking linear nodes: %ds.\n", ( int ) ( stop_t - start_t ) ); sendWorkSignal ( 3, thrdSignal ); //exit thread_wait ( threads ); return 1; }
boolean prlRead2HashTable ( char * libfile, char * outfile ) { long long i; char * next_name, name[256]; FILE * fo; time_t start_t, stop_t; int maxReadNum; int libNo; pthread_t threads[thrd_num]; unsigned char thrdSignal[thrd_num + 1]; PARAMETER paras[thrd_num]; boolean flag, pairs = 0; WORDFILTER = createFilter ( overlaplen ); maxReadLen = 0; maxNameLen = 256; scan_libInfo ( libfile ); alloc_pe_mem ( num_libs ); if ( !maxReadLen ) { maxReadLen = 100; } maxReadLen4all = maxReadLen; printf ( "In %s, %d libs, max seq len %d, max name len %d\n\n", libfile, num_libs, maxReadLen, maxNameLen ); next_name = ( char * ) ckalloc ( ( maxNameLen + 1 ) * sizeof ( char ) ); kmerBuffer = ( Kmer * ) ckalloc ( buffer_size * sizeof ( Kmer ) ); hashBanBuffer = ( ubyte8 * ) ckalloc ( buffer_size * sizeof ( ubyte8 ) ); prevcBuffer = ( char * ) ckalloc ( buffer_size * sizeof ( char ) ); nextcBuffer = ( char * ) ckalloc ( buffer_size * sizeof ( char ) ); maxReadNum = buffer_size / ( maxReadLen - overlaplen + 1 ); //printf("buffer size %d, max read len %d, max read num %d\n",buffer_size,maxReadLen,maxReadNum); seqBuffer = ( char ** ) ckalloc ( maxReadNum * sizeof ( char * ) ); lenBuffer = ( int * ) ckalloc ( maxReadNum * sizeof ( int ) ); indexArray = ( int * ) ckalloc ( maxReadNum * sizeof ( int ) ); for ( i = 0; i < maxReadNum; i++ ) { seqBuffer[i] = ( char * ) ckalloc ( maxReadLen * sizeof ( char ) ); } rcSeq = ( char ** ) ckalloc ( ( thrd_num + 1 ) * sizeof ( char * ) ); if ( 1 ) { kmerCounter = ( long long * ) ckalloc ( ( thrd_num + 1 ) * sizeof ( long long ) ); KmerSets = ( KmerSet ** ) ckalloc ( thrd_num * sizeof ( KmerSet * ) ); ubyte8 init_size = 1024; ubyte8 k = 0; if ( initKmerSetSize ) { init_size = ( ubyte8 ) ( ( double ) initKmerSetSize * 1024.0f * 1024.0f * 1024.0f / ( double ) thrd_num / 32 ); do { ++k; } while ( k * 0xFFFFFFLLU < init_size ); } for ( i = 0; i < thrd_num; i++ ) { //KmerSets[i] = init_kmerset(1024,0.77f); KmerSets[i] = init_kmerset ( k * 0xFFFFFFLLU, 0.77f ); thrdSignal[i + 1] = 0; paras[i].threadID = i; paras[i].mainSignal = &thrdSignal[0]; paras[i].selfSignal = &thrdSignal[i + 1]; kmerCounter[i + 1] = 0; rcSeq[i + 1] = ( char * ) ckalloc ( maxReadLen * sizeof ( char ) ); } creatThrds ( threads, paras ); } thrdSignal[0] = kmerCounter[0] = 0; time ( &start_t ); kmer_c = n_solexa = read_c = i = libNo = readNumBack = gradsCounter = 0; while ( ( flag = read1seqInLib ( seqBuffer[read_c], next_name, & ( lenBuffer[read_c] ), &libNo, pairs, 1 ) ) != 0 ) { if ( ( ++i ) % 100000000 == 0 ) { printf ( "--- %lldth reads\n", i ); } if ( lenBuffer[read_c] < 0 ) { printf ( "read len %d\n", lenBuffer[read_c] ); } if ( lenBuffer[read_c] < overlaplen + 1 ) { continue; } /* if(lenBuffer[read_c]>70) lenBuffer[read_c] = 50; else if(lenBuffer[read_c]>40) lenBuffer[read_c] = 40; */ indexArray[read_c] = kmer_c; kmer_c += lenBuffer[read_c] - overlaplen + 1; read_c++; if ( read_c == maxReadNum ) { kmerCounter[0] += kmer_c; sendWorkSignal ( 2, thrdSignal ); sendWorkSignal ( 1, thrdSignal ); kmer_c = read_c = 0; } } if ( read_c ) { kmerCounter[0] += kmer_c; sendWorkSignal ( 2, thrdSignal ); sendWorkSignal ( 1, thrdSignal ); } time ( &stop_t ); printf ( "time spent on hash reads: %ds, %lld reads processed\n", ( int ) ( stop_t - start_t ), i ); //record insert size info if ( pairs ) { if ( gradsCounter ) printf ( "%d pe insert size, the largest boundary is %lld\n\n", gradsCounter, pes[gradsCounter - 1].PE_bound ); else { printf ( "no paired reads found\n" ); } sprintf ( name, "%s.peGrads", outfile ); fo = ckopen ( name, "w" ); fprintf ( fo, "grads&num: %d\t%lld\n", gradsCounter, n_solexa ); for ( i = 0; i < gradsCounter; i++ ) { fprintf ( fo, "%d\t%lld\t%d\n", pes[i].insertS, pes[i].PE_bound, pes[i].rank ); } fclose ( fo ); } free_pe_mem(); free_libs(); if ( 1 ) { unsigned long long alloCounter = 0; unsigned long long allKmerCounter = 0; for ( i = 0; i < thrd_num; i++ ) { alloCounter += count_kmerset ( ( KmerSets[i] ) ); allKmerCounter += kmerCounter[i + 1]; free ( ( void * ) rcSeq[i + 1] ); } printf ( "%lli nodes allocated, %lli kmer in reads, %lli kmer processed\n" , alloCounter, kmerCounter[0], allKmerCounter ); } free ( ( void * ) rcSeq ); free ( ( void * ) kmerCounter ); for ( i = 0; i < maxReadNum; i++ ) { free ( ( void * ) seqBuffer[i] ); } free ( ( void * ) seqBuffer ); free ( ( void * ) lenBuffer ); free ( ( void * ) indexArray ); free ( ( void * ) kmerBuffer ); free ( ( void * ) hashBanBuffer ); free ( ( void * ) nextcBuffer ); free ( ( void * ) prevcBuffer ); free ( ( void * ) next_name ); //printf("done hashing nodes\n"); if ( deLowKmer ) { time ( &start_t ); deLowCov ( thrdSignal ); time ( &stop_t ); printf ( "time spent on delowcvgNode %ds\n", ( int ) ( stop_t - start_t ) ); } time ( &start_t ); Mark1in1outNode ( thrdSignal ); freqStat ( outfile ); time ( &stop_t ); printf ( "time spent on marking linear nodes %ds\n", ( int ) ( stop_t - start_t ) ); fflush ( stdout ); sendWorkSignal ( 3, thrdSignal ); thread_wait ( threads ); /* Kmer word = 0x21c3ca82c734c8d0; Kmer hash_ban = hash_kmer(word); int setPicker = hash_ban%thrd_num; kmer_t *node; boolean found = search_kmerset(KmerSets[setPicker], word, &node); if(!found) printf("kmer %llx not found,\n",word); else{ printf("kmer %llx, linear %d\n",word,node->linear); for(i=0;i<4;i++){ if(get_kmer_right_cov(*node,i)>0) printf("right %d, kmer %llx\n",i,nextKmer(node->seq,i)); if(get_kmer_left_cov(*node,i)>0) printf("left %d, kmer %llx\n",i,prevKmer(node->seq,i)); } } */ return 1; }