Beispiel #1
0
/*************************************************
Function:
    lastKmer
Description:
    Builds the kmer obtained by shifting a node's kmer by the length of one
    of its kmer-edges and merging in the edge bits, then looks the result
    (or its reverse complement, whichever compares lower) up in the hashtable.
Input:
    1. ht:          the graph hashtable
    2. K_size:      kmer size
    3. node:        the node whose kmer-edge is followed
    4. edge:        the kmer-edge to follow
    5. is_left:     non-zero if the kmer-edge hangs on the node's left side
Output:
    1. smaller:     1 if the built kmer compares less than or equal to its
                    reverse complement (the kmer itself is searched),
                    0 if the reverse complement is searched instead.
                    Left untouched when node or edge is NULL.
Return:
    The result of search_kmer for the chosen kmer.
    NULL if node or edge is NULL.
Note:
    Terminates the process when the edge length (edge->len + 1) exceeds K_size.
*************************************************/
static bucket2 * lastKmer ( hashtable2 * ht, int K_size, bucket2 * node, edge_node * edge, int is_left, int & smaller ) //NEW
{
	if ( node == NULL || edge == NULL )
	{
		return NULL;
	}

	// Number of positions the node kmer is shifted by.
	const int shift = edge->len + 1;

	if ( shift > K_size )
	{
		fprintf ( stderr, "ERROR: g value should be no great than kmer size!\n" );
		exit ( -1 );
	}

	// The edge bits live in the last 64-bit word of an otherwise zeroed kmer.
	kmer_t2 tail;
	memset ( tail.kmer, 0, sizeof ( tail ) );
	tail.kmer[sizeof ( tail ) / sizeof ( uint64_t ) - 1] = edge->edge;

	kmer_t2 mask;
	initKmerFilter ( K_size, &mask );

	kmer_t2 target = node->kmer_t2;

	if ( is_left )
	{
		// Left edge: make room at the high end of the kmer and move the edge bits up there.
		kmerMoveRight ( &target, shift );
		kmerMoveLeft ( &tail, K_size - shift );
	}
	else
	{
		// Right edge: make room at the low end; the edge bits are already in place.
		kmerMoveLeft ( &target, shift );
	}

	kmerOr ( &target, &tail );
	kmerAnd ( &target, &mask );

	kmer_t2 rev_comp = target;
	reverseCompKmer ( &rev_comp, K_size );

	if ( kmerCompare ( &target, &rev_comp ) <= 0 )
	{
		smaller = 1;
		return search_kmer ( ht, &target );
	}

	smaller = 0;
	return search_kmer ( ht, &rev_comp );
}
/*************************************************
Function:
    process_1read_preArc
Description:
    This is the core function for building preArcs.
    1. Chops one read into kmers.
    2. Searches the kmers in vertex hash.
    3. Aligns the vertex's kmer-edge sequences to the read sequence on both sides.
    4. Constructs preArcs according to the mapping result on both sides of a vertex.

    @since r53:
    5. add -R support, solves tiny repeat: when the global flag `solve` is set,
       the edge ids crossed by the read are collected into a path and handed
       to this thread's path buffer.
Input:
    1. arc_arr:     preArc array
    2. locks:       locks array
    3. thread_id:   index of the calling thread's entry in path_buffer
    4. v_ht:        vertex hash
    5. K_size:      kmer size
    6. cut_off_len: cut off length; only the first cut_off_len characters of the read are used
    7. read:        read (NUL-terminated)
Output:
    None.
Return:
    None.
Note:
    Terminates the process on inconsistent edge matches, on a path buffer
    overflow, or when the read would need more kmers than the local buffer holds.
*************************************************/
void process_1read_preArc ( preArc_array * arc_arr, pthread_spinlock_t * locks, int thread_id, vertex_hash2 * v_ht, int K_size, int cut_off_len, const char * read )
{
	const int BUFF_LEN = 1024;
	kmer_t2 kmers[BUFF_LEN];
	int kmer_num = 0;
	//add for -R solving tiny repeats
	//path[0] holds the number of ids, path[1..counter] the ids themselves
	//NOTE(review): edge ids are size_t but are stored as unsigned int here; ids above UINT_MAX would be truncated
	unsigned int path[128];
	unsigned int counter = 0;
	int read_len = strlen ( read );

	if ( read_len > cut_off_len ) { read_len = cut_off_len; }

	int kmer_array_len = read_len - K_size + 1;

	//kmers[] is a fixed-size stack array: refuse reads that would need more entries than it has
	//(assumes chop_kmers may fill up to kmer_array_len entries -- its body is not visible here)
	if ( kmer_array_len > BUFF_LEN )
	{
		fprintf ( stderr, "ERROR: read needs %d kmers, but the kmer buffer only holds %d!\n", kmer_array_len, BUFF_LEN );
		exit ( -1 );
	}

	chop_kmers ( read, read_len, K_size, kmers, kmer_array_len, kmer_num );

	for ( int i = 1; i < kmer_num - 1; ++i ) //search every kmer exclude the begin and end kmer
	{
		vertex2 * v_tmp = search_vertex ( v_ht, &kmers[i] );

		if ( !v_tmp ) { continue; } //this kmer is not a vertex

		size_t left_id = 0;
		size_t right_id = 0;
		int left_found = 0;
		int right_found = 0;

		//search left edge kmer got left id
		for ( edge_starter2 * e_tmp = v_tmp->left; e_tmp; e_tmp = e_tmp->next )
		{
			int edge_len = e_tmp->len;

			if ( edge_len <= i )
			{
				//the kmer edge_len positions before the vertex is inside the read: compare it directly
				if ( kmerCompare ( & ( kmers[i - edge_len] ), & ( e_tmp->edge_kmer ) ) == 0 )
				{
					left_id = e_tmp->edge_id;

					if ( left_found )
					{
						//size_t is printed through %llu, so convert explicitly to the type the format expects
						fprintf ( stderr, "ERROR: left edge id found already !new found id %llu \n", ( unsigned long long ) left_id );
						fprintf ( stderr, "i:%d ,edge_len:%d\n", i, edge_len );
						printKmerSeq ( & ( kmers[i - edge_len] ), K_size, stderr );
						printKmerSeq ( & ( e_tmp->edge_kmer ), K_size, stderr );
						exit ( 1 );
					}

					left_found = 1;
					break;
				}
			}
			else
			{
				//the edge is longer than the i positions available before the vertex:
				//compare the read's first kmer and the edge kmer after shifting both and
				//masking them with a filter built for i positions
				kmer_t2 read_edge = kmers[0];

				if ( K_size > i )
				{
					kmerMoveRight ( &read_edge, K_size - i );
				}

				kmer_t2 KMER_FILTER;
				initKmerFilter ( i, &KMER_FILTER );
				kmer_t2 edge_kmer = e_tmp->edge_kmer;

				if ( K_size > edge_len )
				{
					kmerMoveRight ( &edge_kmer, K_size - edge_len );
				}

				kmerAnd ( &read_edge, &KMER_FILTER );
				kmerAnd ( &edge_kmer, &KMER_FILTER );

				if ( kmerCompare ( &read_edge, &edge_kmer ) == 0 )
				{
					left_found++;
					left_id = e_tmp->edge_id;

					if ( left_found == 2 )
					{
						//can't distinguish which left edge
						break;
					}
				}
			}
		}

		if ( left_found != 1 ) { continue; } //not found or multi found

		//todo : aln  if  left_found = 0  ... find the best
		//search right edge kmer got right id
		for ( edge_starter2 * e_tmp = v_tmp->right; e_tmp; e_tmp = e_tmp->next )
		{
			int edge_len = e_tmp->len;

			if ( edge_len <= kmer_num - 1 - i )
			{
				//the kmer edge_len positions after the vertex is inside the read: compare it directly
				if ( kmerCompare ( & ( kmers[i + edge_len] ), & ( e_tmp->edge_kmer ) ) == 0 )
				{
					right_id = e_tmp->edge_id;

					if ( right_found )
					{
						//size_t is printed through %llu, so convert explicitly to the type the format expects
						fprintf ( stderr, "ERROR: right edge id found already, new found id %llu !\n", ( unsigned long long ) right_id );
						fprintf ( stderr, "i:%d ,edge_len:%d\n", i, edge_len );
						printKmerSeq ( & ( kmers[i + edge_len] ), K_size, stderr );
						printKmerSeq ( & ( e_tmp->edge_kmer ), K_size, stderr );
						exit ( 1 );
					}

					right_found = 1;
					break;
				}
			}
			else
			{
				//the edge is longer than the positions left after the vertex:
				//compare the read's last kmer and the shifted edge kmer, both masked
				//with a filter built for the remaining read_edge_len positions
				int read_edge_len = ( kmer_num - 1 - i );
				kmer_t2 KMER_FILTER;
				initKmerFilter ( read_edge_len, &KMER_FILTER );
				kmer_t2 read_edge = kmers[kmer_num - 1];
				kmerAnd ( &read_edge, &KMER_FILTER );
				kmer_t2 edge_kmer = e_tmp->edge_kmer;

				if ( edge_len > read_edge_len )
				{
					kmerMoveRight ( &edge_kmer, ( edge_len - read_edge_len ) );
				}

				kmerAnd ( &edge_kmer, &KMER_FILTER );

				if ( kmerCompare ( &read_edge, &edge_kmer ) == 0 )
				{
					right_found++;
					right_id = e_tmp->edge_id;

					if ( right_found == 2 )
					{
						//can't distinguish which right edge
						break;
					}
				}
			}
		}

		if ( right_found != 1 ) { continue; } //not found or multi found

		//todo : aln  if  right_found = 0  ... find the best
		//exactly one edge matched on each side: store this preArc
		put_preArc_threaded ( arc_arr, locks, left_id, right_id, 1 );

		//constructing the path ...
		if ( solve )
		{
			if ( counter == 0 )
			{
				counter = 2;
				path[1] = left_id;
				path[2] = right_id;
			}
			else if ( counter <= 100 )
			{
				//at most two ids are appended per vertex, so counter never exceeds 102 and stays inside path[128]
				if ( path[counter] == left_id )
				{
					//this arc continues from the last recorded edge
					path[++counter] = right_id;
				}
				else
				{
					path[++counter] = left_id;
					path[++counter] = right_id;
				}
			}
		}
	}

	//add to path buffer , if full filled ,output it
	if ( solve )
	{
		if ( counter >= 3 && counter <= 100 )
		{
			path[0] = counter;
			int tmp = is_full ( path_buffer[thread_id] );

			if ( tmp == 1 )
			{
				//output it
				output_edge_path_buffer_locked ( path_buffer[thread_id], path_fp, &file_lock );
			}
			else if ( tmp == -1 )
			{
				//error status
				fprintf ( stderr, "ERROR: path buffer overflow!! system exit .\n" );
				exit ( -1 );
			}

			put_path_2_buffer ( path_buffer[thread_id],  path );
		}
	}
}