/************************************************* Function: prlRead2HashTable Description: 1. Imports the reads from the lib file one by one. 2. Chops the reads into kmers and store them in KmerSets. 3. Removes the kmers with low coverage. 4. Marks the linear kmers. 5. Counts the kmer frequences. Input: 1. libfile : the reads config file 2. outfile : the output file prefix Output: None. Return: 1 if exits normally. *************************************************/ boolean prlRead2HashTable ( char * libfile, char * outfile ) { char * cach1; char * cach2; unsigned char asm_ctg = 1; long long i; char * next_name, name[256]; FILE * fo; time_t start_t, stop_t; int maxReadNum; int libNo; pthread_t threads[thrd_num]; unsigned char thrdSignal[thrd_num + 1]; PARAMETER paras[thrd_num]; boolean flag, pairs = 0; WORDFILTER = createFilter ( overlaplen ); maxReadLen = 0; maxNameLen = 256; scan_libInfo ( libfile ); alloc_pe_mem ( num_libs ); if ( !maxReadLen ) { maxReadLen = 100; } if ( gLineLen < maxReadLen ) { gStr = ( char * ) ckalloc ( ( maxReadLen + 1 ) * sizeof ( char ) ); } //init maxReadLen4all = maxReadLen; fprintf ( stderr, "In %s, %d lib(s), maximum read length %d, maximum name length %d.\n\n", libfile, num_libs, maxReadLen, maxNameLen ); next_name = ( char * ) ckalloc ( ( maxNameLen + 1 ) * sizeof ( char ) ); kmerBuffer = ( Kmer * ) ckalloc ( buffer_size * sizeof ( Kmer ) ); hashBanBuffer = ( ubyte8 * ) ckalloc ( buffer_size * sizeof ( ubyte8 ) ); prevcBuffer = ( char * ) ckalloc ( buffer_size * sizeof ( char ) ); nextcBuffer = ( char * ) ckalloc ( buffer_size * sizeof ( char ) ); maxReadNum = buffer_size / ( maxReadLen - overlaplen + 1 ); //printf("buffer size %d, max read len %d, max read num %d\n",buffer_size,maxReadLen,maxReadNum); int maxAIOSize = 32768; aioBuffer1 = ( char * ) ckalloc ( ( maxAIOSize ) * sizeof ( char ) ); aioBuffer2 = ( char * ) ckalloc ( ( maxAIOSize ) * sizeof ( char ) ); readBuffer1 = ( char * ) ckalloc ( ( maxAIOSize + ( maxReadLen * 4 + 1024 ) ) * sizeof ( char ) ); //(char *)ckalloc(maxAIOSize*sizeof(char)); //1024 readBuffer2 = ( char * ) ckalloc ( ( maxAIOSize + ( maxReadLen * 4 + 1024 ) ) * sizeof ( char ) ); //1024 cach1 = ( char * ) ckalloc ( ( maxReadLen * 4 + 1024 ) * sizeof ( char ) ); //1024 cach2 = ( char * ) ckalloc ( ( maxReadLen * 4 + 1024 ) * sizeof ( char ) ); //1024 memset ( cach1, '\0', ( maxReadLen * 4 + 1024 ) ); //1024 memset ( cach2, '\0', ( maxReadLen * 4 + 1024 ) ); //1024 seqBuffer = ( char ** ) ckalloc ( maxReadNum * sizeof ( char * ) ); lenBuffer = ( int * ) ckalloc ( maxReadNum * sizeof ( int ) ); indexArray = ( int * ) ckalloc ( maxReadNum * sizeof ( int ) ); for ( i = 0; i < maxReadNum; i++ ) { seqBuffer[i] = ( char * ) ckalloc ( maxReadLen * sizeof ( char ) ); } rcSeq = ( char ** ) ckalloc ( ( thrd_num + 1 ) * sizeof ( char * ) ); if ( 1 ) { kmerCounter = ( long long * ) ckalloc ( ( thrd_num + 1 ) * sizeof ( long long ) ); KmerSets = ( KmerSet ** ) ckalloc ( thrd_num * sizeof ( KmerSet * ) ); ubyte8 init_size = 1024; ubyte8 k = 0; if ( initKmerSetSize ) { #ifdef MER127 init_size = ( ubyte8 ) ( ( double ) initKmerSetSize * 1024.0f * 1024.0f * 1024.0f / ( double ) thrd_num / 40 ); #else init_size = ( ubyte8 ) ( ( double ) initKmerSetSize * 1024.0f * 1024.0f * 1024.0f / ( double ) thrd_num / 24 ); //is it true? #endif do { ++k; } while ( k * 0xFFFFFFLLU < init_size ); } for ( i = 0; i < thrd_num; i++ ) { //KmerSets[i] = init_kmerset(1024,0.77f); KmerSets[i] = init_kmerset ( ( ( initKmerSetSize ) ? ( k * 0xFFFFFFLLU ) : ( init_size ) ), 0.77f ); thrdSignal[i + 1] = 0; paras[i].threadID = i; paras[i].mainSignal = &thrdSignal[0]; paras[i].selfSignal = &thrdSignal[i + 1]; kmerCounter[i + 1] = 0; rcSeq[i + 1] = ( char * ) ckalloc ( maxReadLen * sizeof ( char ) ); } creatThrds ( threads, paras ); } thrdSignal[0] = kmerCounter[0] = 0; time ( &start_t ); kmer_c = n_solexa = read_c = i = libNo = readNumBack = gradsCounter = 0; while ( openNextFile ( &libNo, pairs, asm_ctg ) ) { //read bam file if ( lib_array[libNo].curr_type == 4 ) { int type = 0; //deside the PE reads is good or bad while ( ( flag = read1seqInLibBam ( seqBuffer[read_c], next_name, & ( lenBuffer[read_c] ), &libNo, pairs, 1, &type ) ) != 0 ) { if ( type == -1 ) //if the reads is bad, go back. { i--; if ( lenBuffer[read_c - 1] >= overlaplen + 1 ) { kmer_c -= lenBuffer[read_c - 1] - overlaplen + 1; read_c--; } n_solexa -= 2; continue; } if ( ( ++i ) % 100000000 == 0 ) { fprintf ( stderr, "--- %lldth reads.\n", i ); } if ( lenBuffer[read_c] < 0 ) { fprintf ( stderr, "Read len %d.\n", lenBuffer[read_c] ); } if ( lenBuffer[read_c] < overlaplen + 1 ) { continue; } /* if(lenBuffer[read_c]>70) lenBuffer[read_c] = 50; else if(lenBuffer[read_c]>40) lenBuffer[read_c] = 40; */ indexArray[read_c] = kmer_c; kmer_c += lenBuffer[read_c] - overlaplen + 1; read_c++; if ( read_c == maxReadNum ) { kmerCounter[0] += kmer_c; sendWorkSignal ( 2, thrdSignal ); //chopKmer4read sendWorkSignal ( 1, thrdSignal ); //singleKmer kmer_c = read_c = 0; } } } //read PE fasta or fastq else if ( lib_array[libNo].curr_type == 1 || lib_array[libNo].curr_type == 2 ) { initAIO ( &aio1, aioBuffer1, fileno ( lib_array[libNo].fp1 ), maxAIOSize ); initAIO ( &aio2, aioBuffer2, fileno ( lib_array[libNo].fp2 ), maxAIOSize ); int offset1, offset2, flag1, flag2, rt1, rt2; offset1 = offset2 = 0; rt1 = aio_read ( &aio1 ); rt2 = aio_read ( &aio2 ); flag1 = AIORead ( &aio1, &offset1, readBuffer1, cach1, &rt1, lib_array[libNo].curr_type ); flag2 = AIORead ( &aio2, &offset2, readBuffer2, cach2, &rt2, lib_array[libNo].curr_type ); if ( flag1 && flag2 ) { int start1, start2, turn; start1 = start2 = 0; turn = 1; while ( start1 < offset1 || start2 < offset2 ) { if ( turn == 1 ) { turn = 2; readseqInLib ( seqBuffer[read_c], next_name, & ( lenBuffer[read_c] ), readBuffer1, &start1, offset1, libNo ); if ( ( ++i ) % 100000000 == 0 ) { fprintf ( stderr, "--- %lldth reads.\n", i ); } if ( lenBuffer[read_c] < 0 ) { fprintf ( stderr, "Read len %d.\n", lenBuffer[read_c] ); } if ( lenBuffer[read_c] < overlaplen + 1 ) { if ( start1 >= offset1 ) { start1 = 0; offset1 = 0; flag1 = AIORead ( &aio1, &offset1, readBuffer1, cach1, &rt1, lib_array[libNo].curr_type ); } continue; } indexArray[read_c] = kmer_c; kmer_c += lenBuffer[read_c] - overlaplen + 1; read_c++; if ( start1 >= offset1 ) { start1 = 0; offset1 = 0; flag1 = AIORead ( &aio1, &offset1, readBuffer1, cach1, &rt1, lib_array[libNo].curr_type ); } if ( read_c == maxReadNum ) { kmerCounter[0] += kmer_c; sendWorkSignal ( 2, thrdSignal ); //chopKmer4read sendWorkSignal ( 1, thrdSignal ); //singleKmer kmer_c = read_c = 0; } continue; } if ( turn == 2 ) { turn = 1; readseqInLib ( seqBuffer[read_c], next_name, & ( lenBuffer[read_c] ), readBuffer2, &start2, offset2, libNo ); if ( ( ++i ) % 100000000 == 0 ) { fprintf ( stderr, "--- %lldth reads.\n", i ); } if ( lenBuffer[read_c] < 0 ) { fprintf ( stderr, "Read len %d.\n", lenBuffer[read_c] ); } if ( lenBuffer[read_c] < overlaplen + 1 ) { if ( ( flag2 == 2 ) && ( start2 >= offset2 ) ) { break; } if ( start2 >= offset2 ) { start2 = 0; offset2 = 0; flag2 = AIORead ( &aio2, &offset2, readBuffer2, cach2, &rt2, lib_array[libNo].curr_type ); } continue; } indexArray[read_c] = kmer_c; kmer_c += lenBuffer[read_c] - overlaplen + 1; read_c++; if ( ( flag2 == 2 ) && ( start2 >= offset2 ) ) { break; } if ( start2 >= offset2 ) { start2 = 0; offset2 = 0; flag2 = AIORead ( &aio2, &offset2, readBuffer2, cach2, &rt2, lib_array[libNo].curr_type ); } if ( read_c == maxReadNum ) { kmerCounter[0] += kmer_c; sendWorkSignal ( 2, thrdSignal ); //chopKmer4read sendWorkSignal ( 1, thrdSignal ); //singleKmer kmer_c = read_c = 0; } continue; } } } else { fprintf(stderr, "Error: aio_read error.\n"); } } //read single fasta, single fastq and PE fasta in one file else { initAIO ( &aio1, aioBuffer1, fileno ( lib_array[libNo].fp1 ), maxAIOSize ); int offset, flag1, rt; offset = 0; rt = aio_read ( &aio1 ); while ( ( flag1 = AIORead ( &aio1, &offset, readBuffer1, cach1, &rt, lib_array[libNo].curr_type ) ) ) { int start = 0; while ( start < offset ) { readseqInLib ( seqBuffer[read_c], next_name, & ( lenBuffer[read_c] ), readBuffer1, &start, offset, libNo ); if ( ( ++i ) % 100000000 == 0 ) { fprintf ( stderr, "--- %lldth reads.\n", i ); } if ( lenBuffer[read_c] < 0 ) { fprintf ( stderr, "Read len %d.\n", lenBuffer[read_c] ); } if ( lenBuffer[read_c] < overlaplen + 1 ) { continue; } indexArray[read_c] = kmer_c; kmer_c += lenBuffer[read_c] - overlaplen + 1; read_c++; } if ( read_c > maxReadNum - 1024 ) { kmerCounter[0] += kmer_c; sendWorkSignal ( 2, thrdSignal ); //chopKmer4read sendWorkSignal ( 1, thrdSignal ); //singleKmer kmer_c = read_c = 0; } if ( flag1 == 2 ) { break; } } } } if ( read_c ) { kmerCounter[0] += kmer_c; sendWorkSignal ( 2, thrdSignal ); //chopKmer4read sendWorkSignal ( 1, thrdSignal ); //singleKmer } time ( &stop_t ); fprintf ( stderr, "Time spent on hashing reads: %ds, %lld read(s) processed.\n", ( int ) ( stop_t - start_t ), i ); //record insert size info if ( pairs ) { if ( gradsCounter ) { fprintf ( stderr, "%d pe insert size, the largest boundary is %lld.\n\n", gradsCounter, pes[gradsCounter - 1].PE_bound ); } else { fprintf ( stderr, "No paired reads found.\n" ); } sprintf ( name, "%s.peGrads", outfile ); fo = ckopen ( name, "w" ); fprintf ( fo, "grads&num: %d\t%lld\n", gradsCounter, n_solexa ); for ( i = 0; i < gradsCounter; i++ ) { fprintf ( fo, "%d\t%lld\t%d\n", pes[i].insertS, pes[i].PE_bound, pes[i].rank ); } fclose ( fo ); } free_pe_mem (); free_libs (); if ( 1 ) { unsigned long long alloCounter = 0; unsigned long long allKmerCounter = 0; for ( i = 0; i < thrd_num; i++ ) { alloCounter += count_kmerset ( ( KmerSets[i] ) ); allKmerCounter += kmerCounter[i + 1]; free ( ( void * ) rcSeq[i + 1] ); } fprintf ( stderr, "%lli node(s) allocated, %lli kmer(s) in reads, %lli kmer(s) processed.\n", alloCounter, kmerCounter[0], allKmerCounter ); } free ( ( void * ) rcSeq ); free ( ( void * ) kmerCounter ); for ( i = 0; i < maxReadNum; i++ ) { free ( ( void * ) seqBuffer[i] ); } free ( ( void * ) seqBuffer ); free ( ( void * ) lenBuffer ); free ( ( void * ) indexArray ); free ( ( void * ) kmerBuffer ); free ( ( void * ) hashBanBuffer ); free ( ( void * ) nextcBuffer ); free ( ( void * ) prevcBuffer ); free ( ( void * ) next_name ); free ( ( void * ) aioBuffer1 ); free ( ( void * ) aioBuffer2 ); free ( ( void * ) readBuffer1 ); free ( ( void * ) readBuffer2 ); free ( ( void * ) cach1 ); free ( ( void * ) cach2 ); fprintf ( stderr, "done hashing nodes\n" ); if ( deLowKmer ) { time ( &start_t ); deLowCov ( thrdSignal ); time ( &stop_t ); fprintf ( stderr, "Time spent on delowcvgNode: %ds.\n", ( int ) ( stop_t - start_t ) ); } time ( &start_t ); Mark1in1outNode ( thrdSignal ); freqStat ( outfile ); time ( &stop_t ); fprintf ( stderr, "Time spent on marking linear nodes: %ds.\n", ( int ) ( stop_t - start_t ) ); sendWorkSignal ( 3, thrdSignal ); //exit thread_wait ( threads ); return 1; }
void prlRead2edge (char *libfile, char *outfile) { char *cach1; char *cach2; unsigned char asm_ctg = 1; long long i; char name[256], *src_name, *next_name; FILE *outfp = NULL; int maxReadNum, libNo; boolean flag, pairs = 0; pthread_t threads[thrd_num]; unsigned char thrdSignal[thrd_num + 1]; PARAMETER paras[thrd_num]; maxReadLen = 0; maxNameLen = 256; scan_libInfo (libfile); alloc_pe_mem (num_libs); if (!maxReadLen) { maxReadLen = 100; } maxReadLen4all = maxReadLen; printf ("In file: %s, max seq len %d, max name len %d\n\n", libfile, maxReadLen, maxNameLen); if (repsTie) { sprintf (name, "%s.path", outfile); outfp = ckopen (name, "wb"); } src_name = (char *) ckalloc ((maxNameLen + 1) * sizeof (char)); next_name = (char *) ckalloc ((10*maxNameLen + 1) * sizeof (char)); kmerBuffer = (Kmer *) ckalloc (buffer_size * sizeof (Kmer)); mixBuffer = (Kmer *) ckalloc (buffer_size * sizeof (Kmer)); hashBanBuffer = (ubyte8 *) ckalloc (buffer_size * sizeof (ubyte8)); nodeBuffer = (kmer_t **) ckalloc (buffer_size * sizeof (kmer_t *)); smallerBuffer = (boolean *) ckalloc (buffer_size * sizeof (boolean)); flagArray = (boolean *) ckalloc (buffer_size * sizeof (boolean)); maxReadNum = buffer_size / (maxReadLen - overlaplen + 1); //printf("buffer for at most %d reads\n",maxReadNum); int maxAIOSize = 32768;/* aioBuffer1 = (char *) ckalloc ((maxAIOSize) * sizeof (char)); aioBuffer2 = (char *) ckalloc ((maxAIOSize) * sizeof (char)); readBuffer1 = (char *) ckalloc ((maxAIOSize + 1024) * sizeof (char)); //(char *)ckalloc(maxAIOSize*sizeof(char)); readBuffer2 = (char *) ckalloc ((maxAIOSize + 1024) * sizeof (char)); cach1 = (char *) ckalloc (1024 * sizeof (char)); cach2 = (char *) ckalloc (1024 * sizeof (char)); memset(cach1,'\0',1024); memset(cach2,'\0',1024);*/ aioBuffer1 = (char *) ckalloc ((maxAIOSize) * sizeof (char)); aioBuffer2 = (char *) ckalloc ((maxAIOSize) * sizeof (char)); readBuffer1 = (char *) ckalloc ((maxAIOSize + (maxReadLen+1024)) * sizeof (char)); //(char *)ckalloc(maxAIOSize*sizeof(char)); //1024 readBuffer2 = (char *) ckalloc ((maxAIOSize + (maxReadLen+1024)) * sizeof (char)); //1024 cach1 = (char *) ckalloc ((maxReadLen+1024) * sizeof (char)); //1024 cach2 = (char *) ckalloc ((maxReadLen+1024) * sizeof (char)); //1024 memset(cach1,'\0',(maxReadLen+1024)); //1024 memset(cach2,'\0',(maxReadLen+1024)); //1024 seqBuffer = (char **) ckalloc (maxReadNum * sizeof (char *)); lenBuffer = (int *) ckalloc (maxReadNum * sizeof (int)); indexArray = (int *) ckalloc ((maxReadNum + 1) * sizeof (int)); for (i = 0; i < maxReadNum; i++) { seqBuffer[i] = (char *) ckalloc (maxReadLen * sizeof (char)); } memoAlloc4preArc (); flags = (char **) ckalloc ((thrd_num + 1) * sizeof (char *)); deletion = (int *) ckalloc ((thrd_num + 1) * sizeof (int)); rcSeq = (char **) ckalloc ((thrd_num + 1) * sizeof (char *)); if (repsTie) { markerOnEdge = (unsigned char *) ckalloc ((num_ed + 1) * sizeof (unsigned char)); for (i = 1; i <= num_ed; i++) { markerOnEdge[i] = 0; } fwriteBuf = (unsigned int *) ckalloc ((maxReadLen - overlaplen + 1) * sizeof (unsigned int)); } thrdSignal[0] = 0; if (1) { preArc_mem_managers = (MEM_MANAGER **) ckalloc (thrd_num * sizeof (MEM_MANAGER *)); arcCounters = (unsigned int *) ckalloc (thrd_num * sizeof (unsigned int)); for (i = 0; i < thrd_num; i++) { arcCounters[i] = 0; preArc_mem_managers[i] = createMem_manager (preARCBLOCKSIZE, sizeof (preARC)); deletion[i + 1] = 0; flags[i + 1] = (char *) ckalloc (2 * maxReadLen * sizeof (char)); rcSeq[i + 1] = (char *) ckalloc (maxReadLen * sizeof (char)); thrdSignal[i + 1] = 0; paras[i].threadID = i; paras[i].mainSignal = &thrdSignal[0]; paras[i].selfSignal = &thrdSignal[i + 1]; } creatThrds (threads, paras); } if (1) { deletion[0] = 0; flags[0] = (char *) ckalloc (2 * maxReadLen * sizeof (char)); rcSeq[0] = (char *) ckalloc (maxReadLen * sizeof (char)); } kmer_c = n_solexa = read_c = i = libNo = readNumBack = gradsCounter = 0; int t0, t1, t2, t3, t4, t5, t6; t0 = t1 = t2 = t3 = t4 = t5 = t6 = 0; time_t read_start, read_end, time_bef, time_aft; time (&read_start); while (openNextFile (&libNo, pairs, asm_ctg)) { if (lib_array[libNo].curr_type == 4) { int type = 0; //deside the PE reads is good or bad while ((flag = read1seqInLibBam (seqBuffer[read_c], next_name, &(lenBuffer[read_c]), &libNo, pairs, 1, &type)) != 0) { if (type == -1) //if the reads is bad, go back. { i--; if (lenBuffer[read_c - 1] >= overlaplen + 1) { kmer_c -= lenBuffer[read_c - 1] - overlaplen + 1; read_c--; } n_solexa -= 2; continue; } if ((++i) % 1000000 == 0) { printf ("--- %lldth reads\n", i); } if (lenBuffer[read_c] < overlaplen + 1) { continue; } //if(lenBuffer[read_c]>70) // lenBuffer[read_c] = 70; //else if(lenBuffer[read_c]>40) // lenBuffer[read_c] = 40; indexArray[read_c] = kmer_c; kmer_c += lenBuffer[read_c] - overlaplen + 1; read_c++; if (read_c == maxReadNum) { indexArray[read_c] = kmer_c; time (&read_end); t0 += read_end - read_start; time (&time_bef); sendWorkSignal (2, thrdSignal); time (&time_aft); t1 += time_aft - time_bef; time (&time_bef); sendWorkSignal (1, thrdSignal); time (&time_aft); t2 += time_aft - time_bef; time (&time_bef); sendWorkSignal (3, thrdSignal); time (&time_aft); t3 += time_aft - time_bef; time (&time_bef); sendWorkSignal (4, thrdSignal); time (&time_aft); t4 += time_aft - time_bef; time (&time_bef); sendWorkSignal (6, thrdSignal); time (&time_aft); t5 += time_aft - time_bef; time (&time_bef); //recordPreArc(); if (repsTie) { recordPathBin (outfp); } time (&time_aft); t6 += time_aft - time_bef; //output_path(read_c,edge_no,flags,outfp); kmer_c = 0; read_c = 0; time (&read_start); } } } else if (lib_array[libNo].curr_type == 1 || lib_array[libNo].curr_type == 2) { initAIO (&aio1, aioBuffer1, fileno (lib_array[libNo].fp1), maxAIOSize); initAIO (&aio2, aioBuffer2, fileno (lib_array[libNo].fp2), maxAIOSize); int offset1, offset2, flag1, flag2, rt1, rt2; offset1 = offset2 = 0; rt1 = aio_read (&aio1); rt2 = aio_read (&aio2); flag1 = AIORead (&aio1, &offset1, readBuffer1, cach1, &rt1, lib_array[libNo].curr_type); flag2 = AIORead (&aio2, &offset2, readBuffer2, cach2, &rt2, lib_array[libNo].curr_type); if(flag1 && flag2) { int start1, start2, turn; start1 = start2 = 0; turn = 1; while (start1 < offset1 || start2 < offset2) { if (turn == 1) { turn = 2; readseqInLib (seqBuffer[read_c], next_name, &(lenBuffer[read_c]), readBuffer1, &start1, offset1, libNo); if ((++i) % 1000000 == 0) printf ("--- %lldth reads\n", i); /* if (lenBuffer[read_c] < overlaplen + 1) continue;*/ if (lenBuffer[read_c] < overlaplen + 1) { if(start1>=offset1) { start1=0; flag1=AIORead (&aio1, &offset1, readBuffer1, cach1, &rt1, lib_array[libNo].curr_type); } continue; } indexArray[read_c] = kmer_c; kmer_c += lenBuffer[read_c] - overlaplen + 1; read_c++; if(start1>=offset1){ start1=0; flag1=AIORead (&aio1, &offset1, readBuffer1, cach1, &rt1, lib_array[libNo].curr_type); } if (read_c == maxReadNum) { indexArray[read_c] = kmer_c; time (&read_end); t0 += read_end - read_start; time (&time_bef); sendWorkSignal (2, thrdSignal); time (&time_aft); t1 += time_aft - time_bef; time (&time_bef); sendWorkSignal (1, thrdSignal); time (&time_aft); t2 += time_aft - time_bef; time (&time_bef); sendWorkSignal (3, thrdSignal); time (&time_aft); t3 += time_aft - time_bef; time (&time_bef); sendWorkSignal (4, thrdSignal); time (&time_aft); t4 += time_aft - time_bef; time (&time_bef); sendWorkSignal (6, thrdSignal); time (&time_aft); t5 += time_aft - time_bef; time (&time_bef); //recordPreArc(); if (repsTie) recordPathBin (outfp); time (&time_aft); t6 += time_aft - time_bef; //output_path(read_c,edge_no,flags,outfp); kmer_c = 0; read_c = 0; time (&read_start); } continue; } if (turn == 2) { turn = 1; readseqInLib (seqBuffer[read_c], next_name, &(lenBuffer[read_c]), readBuffer2, &start2, offset2, libNo); if ((++i) % 1000000 == 0) printf ("--- %lldth reads\n", i); /* if (lenBuffer[read_c] < overlaplen + 1) continue;*/ if (lenBuffer[read_c] < overlaplen + 1) { if((flag2 == 2) && (start2 >= offset2)) break; if(start2 >= offset2) { start2=0; flag2 = AIORead (&aio2, &offset2, readBuffer2, cach2, &rt2, lib_array[libNo].curr_type); } continue; } indexArray[read_c] = kmer_c; kmer_c += lenBuffer[read_c] - overlaplen + 1; read_c++; if((flag2 == 2) && (start2 >= offset2)) break; if(start2 >= offset2){ start2=0; flag2 = AIORead (&aio2, &offset2, readBuffer2, cach2, &rt2, lib_array[libNo].curr_type); } if (read_c == maxReadNum){ indexArray[read_c] = kmer_c; time (&read_end); t0 += read_end - read_start; time (&time_bef); sendWorkSignal (2, thrdSignal); time (&time_aft); t1 += time_aft - time_bef; time (&time_bef); sendWorkSignal (1, thrdSignal); time (&time_aft); t2 += time_aft - time_bef; time (&time_bef); sendWorkSignal (3, thrdSignal); time (&time_aft); t3 += time_aft - time_bef; time (&time_bef); sendWorkSignal (4, thrdSignal); time (&time_aft); t4 += time_aft - time_bef; time (&time_bef); sendWorkSignal (6, thrdSignal); time (&time_aft); t5 += time_aft - time_bef; time (&time_bef); //recordPreArc(); if (repsTie) recordPathBin (outfp); time (&time_aft); t6 += time_aft - time_bef; //output_path(read_c,edge_no,flags,outfp); kmer_c = 0; read_c = 0; time (&read_start); } continue; } } } } else { initAIO (&aio1, aioBuffer1, fileno (lib_array[libNo].fp1), maxAIOSize); int offset, flag1, rt; offset = 0; rt = aio_read (&aio1); while ((flag1 = AIORead (&aio1, &offset, readBuffer1, cach1, &rt, lib_array[libNo].curr_type))) { int start = 0; while (start < offset) { readseqInLib (seqBuffer[read_c], next_name, &(lenBuffer[read_c]), readBuffer1, &start, offset, libNo); if ((++i) % 1000000 == 0) printf ("--- %lld reads\n", i); if (lenBuffer[read_c] < overlaplen + 1) continue; indexArray[read_c] = kmer_c; kmer_c += lenBuffer[read_c] - overlaplen + 1; read_c++; if (read_c > maxReadNum - 1024) { indexArray[read_c] = kmer_c; time (&read_end); t0 += read_end - read_start; time (&time_bef); sendWorkSignal (2, thrdSignal); time (&time_aft); t1 += time_aft - time_bef; time (&time_bef); sendWorkSignal (1, thrdSignal); time (&time_aft); t2 += time_aft - time_bef; time (&time_bef); sendWorkSignal (3, thrdSignal); time (&time_aft); t3 += time_aft - time_bef; time (&time_bef); sendWorkSignal (4, thrdSignal); time (&time_aft); t4 += time_aft - time_bef; time (&time_bef); sendWorkSignal (6, thrdSignal); time (&time_aft); t5 += time_aft - time_bef; time (&time_bef); //recordPreArc(); if (repsTie) recordPathBin (outfp); time (&time_aft); t6 += time_aft - time_bef; //output_path(read_c,edge_no,flags,outfp); kmer_c = 0; read_c = 0; time (&read_start); } } if (flag1 == 2) break; } } } printf ("%lld reads processed\n", i); printf ("time %d,%d,%d,%d,%d,%d,%d\n", t0, t1, t2, t3, t4, t5, t6); if (read_c) { indexArray[read_c] = kmer_c; sendWorkSignal (2, thrdSignal); sendWorkSignal (1, thrdSignal); sendWorkSignal (3, thrdSignal); sendWorkSignal (4, thrdSignal); sendWorkSignal (6, thrdSignal); //recordPreArc(); if (repsTie) { recordPathBin (outfp); } } printf ("%lld markers outputed\n", markCounter); sendWorkSignal (5, thrdSignal); thread_wait (threads); output_arcs (outfile); memoFree4preArc (); if (1) // multi-threads { arcCounter = 0; for (i = 0; i < thrd_num; i++) { arcCounter += arcCounters[i]; free ((void *) flags[i + 1]); deletion[0] += deletion[i + 1]; free ((void *) rcSeq[i + 1]); } } if (1) { free ((void *) flags[0]); free ((void *) rcSeq[0]); } printf ("done mapping reads, %d reads deleted, %lld arcs created\n", deletion[0], arcCounter); if (repsTie) { free ((void *) markerOnEdge); free ((void *) fwriteBuf); } free ((void *) arcCounters); free ((void *) rcSeq); for (i = 0; i < maxReadNum; i++) { free ((void *) seqBuffer[i]); } free ((void *) seqBuffer); free ((void *) lenBuffer); free ((void *) indexArray); free ((void *) flags); free ((void *) deletion); free ((void *) kmerBuffer); free ((void *) mixBuffer); free ((void *) smallerBuffer); free ((void *) flagArray); free ((void *) hashBanBuffer); free ((void *) nodeBuffer); free ((void *) src_name); free ((void *) next_name); free ((void *) aioBuffer1); free ((void *) aioBuffer2); free ((void *) readBuffer1); free ((void *) readBuffer2); free ((void *) cach1); free ((void *) cach2); if (repsTie) { fclose (outfp); } free_pe_mem (); free_libs (); }
boolean prlContig2nodes (char *grapfile, int len_cut) { long long i, num_seq; char name[256], *next_name; FILE *fp; pthread_t threads[thrd_num]; time_t start_t, stop_t; unsigned char thrdSignal[thrd_num + 1]; PARAMETER paras[thrd_num]; int maxCtgLen, minCtgLen, nameLen; unsigned int lenSum, contigId; WORDFILTER = createFilter (overlaplen); time (&start_t); sprintf (name, "%s.contig", grapfile); fp = ckopen (name, "r"); maxCtgLen = nameLen = 10; minCtgLen = 1000; num_seq = readseqpar (&maxCtgLen, &minCtgLen, &nameLen, fp); printf ("\nthere're %lld contigs in file: %s, max seq len %d, min seq len %d, max name len %d\n", num_seq, grapfile, maxCtgLen, minCtgLen, nameLen); maxReadLen = maxCtgLen; fclose (fp); time (&stop_t); printf ("time spent on parse contigs file %ds\n", (int) (stop_t - start_t)); next_name = (char *) ckalloc ((maxNameLen + 1) * sizeof (char)); // extract all the EDONs seq_buffer_size = buffer_size * 2; max_read_c = seq_buffer_size / 20; kmerBuffer = (Kmer *) ckalloc (buffer_size * sizeof (Kmer)); hashBanBuffer = (ubyte8 *) ckalloc (buffer_size * sizeof (ubyte8)); smallerBuffer = (boolean *) ckalloc (buffer_size * sizeof (boolean)); seqBuffer = (char *) ckalloc (seq_buffer_size * sizeof (char)); lenBuffer = (int *) ckalloc (max_read_c * sizeof (int)); indexArray = (unsigned int *) ckalloc ((max_read_c + 1) * sizeof (unsigned int)); seqBreakers = (unsigned int *) ckalloc ((max_read_c + 1) * sizeof (unsigned int)); ctgIdArray = (int *) ckalloc (max_read_c * sizeof (int)); fp = ckopen (name, "r"); //node_mem_manager = createMem_manager(EDONBLOCKSIZE,sizeof(EDON)); rcSeq = (char **) ckalloc ((thrd_num + 1) * sizeof (char *)); if (1) { kmerCounter = (long long *) ckalloc ((thrd_num + 1) * sizeof (long long)); KmerSets = (KmerSet **) ckalloc (thrd_num * sizeof (KmerSet *)); for (i = 0; i < thrd_num; i++) { KmerSets[i] = init_kmerset (1024, 0.77f); thrdSignal[i + 1] = 0; paras[i].threadID = i; paras[i].mainSignal = &thrdSignal[0]; paras[i].selfSignal = &thrdSignal[i + 1]; kmerCounter[i + 1] = 0; rcSeq[i + 1] = (char *) ckalloc (maxCtgLen * sizeof (char)); } creatThrds (threads, paras); } kmer_c = thrdSignal[0] = kmerCounter[0] = 0; time (&start_t); read_c = lenSum = i = seqBreakers[0] = indexArray[0] = 0; readseq1by1 (seqBuffer + seqBreakers[read_c], next_name, &(lenBuffer[read_c]), fp, -1); while (!feof (fp)) { contigId = getID (next_name); readseq1by1 (seqBuffer + seqBreakers[read_c], next_name, &(lenBuffer[read_c]), fp, 1); if ((++i) % 10000000 == 0) { printf ("--- %lldth contigs\n", i); } if (lenBuffer[read_c] < overlaplen + 1 || lenBuffer[read_c] < len_cut) { contigId = getID (next_name); continue; } //printf("len of seq %d is %d, ID %d\n",read_c,lenBuffer[read_c],contigId); ctgIdArray[read_c] = contigId > 0 ? contigId : i; lenSum += lenBuffer[read_c]; kmer_c += lenBuffer[read_c] - overlaplen + 1; read_c++; seqBreakers[read_c] = lenSum; indexArray[read_c] = kmer_c; //printf("seq %d start at %d\n",read_c,seqBreakers[read_c]); if (read_c == max_read_c || (lenSum + maxCtgLen) > seq_buffer_size || (kmer_c + maxCtgLen - overlaplen + 1) > buffer_size) { kmerCounter[0] += kmer_c; sendWorkSignal (2, thrdSignal); sendWorkSignal (1, thrdSignal); kmer_c = read_c = lenSum = 0; } } if (read_c) { kmerCounter[0] += kmer_c; sendWorkSignal (2, thrdSignal); sendWorkSignal (1, thrdSignal); } sendWorkSignal (3, thrdSignal); thread_wait (threads); time (&stop_t); printf ("time spent on hash reads: %ds\n", (int) (stop_t - start_t)); if (1) { unsigned long long alloCounter = 0; unsigned long long allKmerCounter = 0; for (i = 0; i < thrd_num; i++) { alloCounter += count_kmerset ((KmerSets[i])); allKmerCounter += kmerCounter[i + 1]; free ((void *) rcSeq[i + 1]); } printf ("%lli nodes allocated, %lli kmer in reads, %lli kmer processed\n", alloCounter, kmerCounter[0], allKmerCounter); } free ((void *) rcSeq); free ((void *) kmerCounter); free ((void *) seqBuffer); free ((void *) lenBuffer); free ((void *) indexArray); free ((void *) seqBreakers); free ((void *) ctgIdArray); free ((void *) kmerBuffer); free ((void *) hashBanBuffer); free ((void *) smallerBuffer); free ((void *) next_name); fclose (fp); return 1; }
/************************************************* Function: build_preArc_threaded Description: This is the main entry for building preArcs. Input: 1. arc_arr: preArc array 2. v_ht: vertex hash 3. K_size: kmer size 4. cut_off_len: cut off length 5. in_filenames_vt: input reads file names 6. thread_num: thread number Output: None. Return: None. *************************************************/ void build_preArc_threaded ( preArc_array * arc_arr, vertex_hash2 * v_ht, int K_size, int cut_off_len, vector<string> *in_filenames_vt, int thread_num ) { //create main io thread int read_buf_sz = 102400 * thrd_num_s; read_buf0 = new string[read_buf_sz]; read_buf1 = new string[read_buf_sz]; io_stat0 = 1; //must be one, if io_stat0 =0 ,the io thread will work immediately io_stat1 = 1; io_ready = 0; io_para_main io_para_mains; io_para_mains.read_buf_sz = read_buf_sz; io_para_mains.in_filenames_vt = in_filenames_vt; pthread_t io_thread; int temp; //fprintf(stderr,"Creating main io thread ...\n"); if ( ( temp = pthread_create ( &io_thread, NULL, run_io_thread_main, &io_para_mains ) ) != 0 ) { fprintf ( stderr, "ERROR: failed creating main io thread.\n" ); exit ( -1 ); } fprintf ( stderr, "1 io thread initialized.\n" ); //create work threads .. //fprintf(stderr,"Creating work threads ...\n"); pthread_t threads[thrd_num_s]; unsigned char thrdSignal[thrd_num_s + 1]; PARAMETER paras[thrd_num_s]; locks = ( pthread_spinlock_t * ) calloc ( arc_arr->array_sz, sizeof ( pthread_spinlock_t ) ); //init as unlock stat .. for ( size_t i = 0; i < arc_arr->array_sz; ++i ) { locks[i] = 1; } for ( int k = 0; k < thrd_num_s; k++ ) { thrdSignal[k + 1] = 0; paras[k].threadID = k; paras[k].mainSignal = &thrdSignal[0]; paras[k].selfSignal = &thrdSignal[k + 1]; paras[k].ht = NULL; paras[k].preArcs = arc_arr; paras[k].v_ht = v_ht; paras[k].cut_off_len = cut_off_len; paras[k].K_size = K_size; paras[k].gap = gap; } creatThrds ( threads, paras ); thrdSignal[0] = 0; //run it while ( 1 ) { sendIOWorkSignal(); while ( io_ready == 0 ) {usleep ( 1 );} if ( io_ready ) { sendWorkSignal ( 12, thrdSignal ); } if ( io_ready == 2 ) { //fprintf(stderr,"All reads have been processed!\n"); break; } } sendWorkSignal ( 3, thrdSignal ); thread_wait ( threads ); delete [] read_buf0; delete [] read_buf1; free ( ( void * ) locks ); free_vertex_hash ( v_ht ); }
boolean prlRead2HashTable ( char * libfile, char * outfile ) { long long i; char * next_name, name[256]; FILE * fo; time_t start_t, stop_t; int maxReadNum; int libNo; pthread_t threads[thrd_num]; unsigned char thrdSignal[thrd_num + 1]; PARAMETER paras[thrd_num]; boolean flag, pairs = 0; WORDFILTER = createFilter ( overlaplen ); maxReadLen = 0; maxNameLen = 256; scan_libInfo ( libfile ); alloc_pe_mem ( num_libs ); if ( !maxReadLen ) { maxReadLen = 100; } maxReadLen4all = maxReadLen; printf ( "In %s, %d libs, max seq len %d, max name len %d\n\n", libfile, num_libs, maxReadLen, maxNameLen ); next_name = ( char * ) ckalloc ( ( maxNameLen + 1 ) * sizeof ( char ) ); kmerBuffer = ( Kmer * ) ckalloc ( buffer_size * sizeof ( Kmer ) ); hashBanBuffer = ( ubyte8 * ) ckalloc ( buffer_size * sizeof ( ubyte8 ) ); prevcBuffer = ( char * ) ckalloc ( buffer_size * sizeof ( char ) ); nextcBuffer = ( char * ) ckalloc ( buffer_size * sizeof ( char ) ); maxReadNum = buffer_size / ( maxReadLen - overlaplen + 1 ); //printf("buffer size %d, max read len %d, max read num %d\n",buffer_size,maxReadLen,maxReadNum); seqBuffer = ( char ** ) ckalloc ( maxReadNum * sizeof ( char * ) ); lenBuffer = ( int * ) ckalloc ( maxReadNum * sizeof ( int ) ); indexArray = ( int * ) ckalloc ( maxReadNum * sizeof ( int ) ); for ( i = 0; i < maxReadNum; i++ ) { seqBuffer[i] = ( char * ) ckalloc ( maxReadLen * sizeof ( char ) ); } rcSeq = ( char ** ) ckalloc ( ( thrd_num + 1 ) * sizeof ( char * ) ); if ( 1 ) { kmerCounter = ( long long * ) ckalloc ( ( thrd_num + 1 ) * sizeof ( long long ) ); KmerSets = ( KmerSet ** ) ckalloc ( thrd_num * sizeof ( KmerSet * ) ); ubyte8 init_size = 1024; ubyte8 k = 0; if ( initKmerSetSize ) { init_size = ( ubyte8 ) ( ( double ) initKmerSetSize * 1024.0f * 1024.0f * 1024.0f / ( double ) thrd_num / 32 ); do { ++k; } while ( k * 0xFFFFFFLLU < init_size ); } for ( i = 0; i < thrd_num; i++ ) { //KmerSets[i] = init_kmerset(1024,0.77f); KmerSets[i] = init_kmerset ( k * 0xFFFFFFLLU, 0.77f ); thrdSignal[i + 1] = 0; paras[i].threadID = i; paras[i].mainSignal = &thrdSignal[0]; paras[i].selfSignal = &thrdSignal[i + 1]; kmerCounter[i + 1] = 0; rcSeq[i + 1] = ( char * ) ckalloc ( maxReadLen * sizeof ( char ) ); } creatThrds ( threads, paras ); } thrdSignal[0] = kmerCounter[0] = 0; time ( &start_t ); kmer_c = n_solexa = read_c = i = libNo = readNumBack = gradsCounter = 0; while ( ( flag = read1seqInLib ( seqBuffer[read_c], next_name, & ( lenBuffer[read_c] ), &libNo, pairs, 1 ) ) != 0 ) { if ( ( ++i ) % 100000000 == 0 ) { printf ( "--- %lldth reads\n", i ); } if ( lenBuffer[read_c] < 0 ) { printf ( "read len %d\n", lenBuffer[read_c] ); } if ( lenBuffer[read_c] < overlaplen + 1 ) { continue; } /* if(lenBuffer[read_c]>70) lenBuffer[read_c] = 50; else if(lenBuffer[read_c]>40) lenBuffer[read_c] = 40; */ indexArray[read_c] = kmer_c; kmer_c += lenBuffer[read_c] - overlaplen + 1; read_c++; if ( read_c == maxReadNum ) { kmerCounter[0] += kmer_c; sendWorkSignal ( 2, thrdSignal ); sendWorkSignal ( 1, thrdSignal ); kmer_c = read_c = 0; } } if ( read_c ) { kmerCounter[0] += kmer_c; sendWorkSignal ( 2, thrdSignal ); sendWorkSignal ( 1, thrdSignal ); } time ( &stop_t ); printf ( "time spent on hash reads: %ds, %lld reads processed\n", ( int ) ( stop_t - start_t ), i ); //record insert size info if ( pairs ) { if ( gradsCounter ) printf ( "%d pe insert size, the largest boundary is %lld\n\n", gradsCounter, pes[gradsCounter - 1].PE_bound ); else { printf ( "no paired reads found\n" ); } sprintf ( name, "%s.peGrads", outfile ); fo = ckopen ( name, "w" ); fprintf ( fo, "grads&num: %d\t%lld\n", gradsCounter, n_solexa ); for ( i = 0; i < gradsCounter; i++ ) { fprintf ( fo, "%d\t%lld\t%d\n", pes[i].insertS, pes[i].PE_bound, pes[i].rank ); } fclose ( fo ); } free_pe_mem(); free_libs(); if ( 1 ) { unsigned long long alloCounter = 0; unsigned long long allKmerCounter = 0; for ( i = 0; i < thrd_num; i++ ) { alloCounter += count_kmerset ( ( KmerSets[i] ) ); allKmerCounter += kmerCounter[i + 1]; free ( ( void * ) rcSeq[i + 1] ); } printf ( "%lli nodes allocated, %lli kmer in reads, %lli kmer processed\n" , alloCounter, kmerCounter[0], allKmerCounter ); } free ( ( void * ) rcSeq ); free ( ( void * ) kmerCounter ); for ( i = 0; i < maxReadNum; i++ ) { free ( ( void * ) seqBuffer[i] ); } free ( ( void * ) seqBuffer ); free ( ( void * ) lenBuffer ); free ( ( void * ) indexArray ); free ( ( void * ) kmerBuffer ); free ( ( void * ) hashBanBuffer ); free ( ( void * ) nextcBuffer ); free ( ( void * ) prevcBuffer ); free ( ( void * ) next_name ); //printf("done hashing nodes\n"); if ( deLowKmer ) { time ( &start_t ); deLowCov ( thrdSignal ); time ( &stop_t ); printf ( "time spent on delowcvgNode %ds\n", ( int ) ( stop_t - start_t ) ); } time ( &start_t ); Mark1in1outNode ( thrdSignal ); freqStat ( outfile ); time ( &stop_t ); printf ( "time spent on marking linear nodes %ds\n", ( int ) ( stop_t - start_t ) ); fflush ( stdout ); sendWorkSignal ( 3, thrdSignal ); thread_wait ( threads ); /* Kmer word = 0x21c3ca82c734c8d0; Kmer hash_ban = hash_kmer(word); int setPicker = hash_ban%thrd_num; kmer_t *node; boolean found = search_kmerset(KmerSets[setPicker], word, &node); if(!found) printf("kmer %llx not found,\n",word); else{ printf("kmer %llx, linear %d\n",word,node->linear); for(i=0;i<4;i++){ if(get_kmer_right_cov(*node,i)>0) printf("right %d, kmer %llx\n",i,nextKmer(node->seq,i)); if(get_kmer_left_cov(*node,i)>0) printf("left %d, kmer %llx\n",i,prevKmer(node->seq,i)); } } */ return 1; }