Example #1
0
/*************************************************
Function:
    startEdgeFromNode
Description:
    Constructs edges from a branched node or end node.
    for every branch (left , right)
    1. Puts the linear node into a stack
    2. Checks the edge to be built form the stack are plalindrome or not
    3. Builds an edge by merge the linear nodes
Input:
    1. ht:      the graph hashtable
    2. K_size:      kmer size
    3. fp:      the file pointer for writing out edge sequences
Output:
    None.
Return:
    Zero.
*************************************************/
static int startEdgeFromNode ( hashtable2 * ht, int K_size, bucket2 * node, FILE * fp )
{
	static size_t call_times;
	call_times++;

	if ( node->kmer_info.linear || node->kmer_info.deleted )
	{
		return 0;//linear node ...
	}

	int left, right;
	left = count_left_edge_num ( node );
	right = count_right_edge_num ( node );

	if ( left == 0 && right == 0 )
	{
		return 0; //it's a dead node
	}

	list<stacked_node2 *> stack;
	edge_node * t_edge = NULL, *t_next = NULL;
	stacked_node2 * t_stacked_node = NULL;
	vector<preEDGE2> loops_edges;
	int node_c;
	//for right edge
	t_edge = node->kmer_info.right;

	while ( t_edge )
	{
		if ( t_edge->used == 1 )
		{
			t_edge = t_edge->nxt_edge;
			continue;
		}

		t_stacked_node = ( stacked_node2 * ) malloc ( sizeof ( stacked_node2 ) );
		t_stacked_node->node = node;
		t_stacked_node->is_left = 0;
		t_stacked_node->edge = t_edge;
		t_stacked_node->next = NULL;
		stack.push_back ( t_stacked_node );
		t_edge->used = 1;
		stringBeads ( ht, K_size, stack, t_stacked_node, t_edge, &node_c );
		process_1stack ( ht, K_size, stack, fp, loops_edges );
		t_next = t_edge->nxt_edge;//because this procedure will remove the edge t_edge
		dislink ( ht, K_size, stack.front() );

		if ( stack.size() > 2 )
		{
			stack.pop_back();//change the stack

			if ( stack.back() && stack.size() > 1 ) //last but second node
			{
				dislink ( ht, K_size, stack.back() );
			}
		}

		stacked_node2 * head, *tmp_node;
		head = stack.front();

		while ( head )
		{
			tmp_node = head;
			free ( tmp_node );
			head = head->next;
		}

		stack.clear();
		t_edge = t_next;
	}

	//for left edge
	t_edge = node->kmer_info.left;

	while ( t_edge )
	{
		if ( t_edge->used == 1 )
		{
			t_edge = t_edge->nxt_edge;
			continue;
		}

		t_stacked_node = ( stacked_node2 * ) malloc ( sizeof ( stacked_node2 ) );
		t_stacked_node->node = node;
		t_stacked_node->is_left = 1;
		t_stacked_node->edge = t_edge;
		t_stacked_node->next = NULL;
		stack.push_back ( t_stacked_node );
		t_edge->used = 1;
		stringBeads ( ht, K_size, stack, t_stacked_node, t_edge, &node_c ); //
		process_1stack ( ht, K_size, stack, fp, loops_edges );
		t_next = t_edge->nxt_edge;//because this procedure will remove the edge t_edge
		dislink ( ht, K_size, stack.front() );

		if ( stack.size() > 2 )
		{
			stack.pop_back();//change the stack

			if ( stack.back() && stack.size() > 1 ) //last but second node
			{
				dislink ( ht, K_size, stack.back() );
			}
		}

		//debug<<"before free stack"<<endl;
		stacked_node2 * head, *tmp_node;
		head = stack.front();

		while ( head )
		{
			tmp_node = head;
			free ( tmp_node );
			head = head->next;
		}

		stack.clear();
		t_edge = t_next;
	}

	if ( loops_edges.size() > 0 )
	{
		//fprintf(stderr,"loops_edges size %llu\n",loops_edges.size());
		int i, j, size;
		bool need_output;
		size = loops_edges.size();
		need_output = 1;

		//bool debug = 0;
		for ( i = 0; i < size; i++ )
		{
			string seq = * ( loops_edges[i].full_edge );
			string rc_seq = revCompSeq ( seq );
			/*
			if(seq.compare("AATTGGACGTGAGAGCAAATTGTATTGAGCATACAATTTGCTCTCACGTCCAATT") == 0) {
			                    fprintf(stderr,"in loops_edges %d %s\n",i,seq.c_str());
			    debug = 1;
			            }

			            if(seq.compare("AATTGGACGTGAGAGCAAATTGTATGCTCAATACAATTTGCTCTCACGTCCAATT") == 0) {
			                    fprintf(stderr,"in loops_edges %d %s\n",i,seq.c_str());
			    debug = 1;
			            }

			if(debug ){
			    fprintf(stderr, "%d %s\n",i,seq.c_str());
			    fprintf(stderr, "%d %s\n",i,rc_seq.c_str());
			}*/

			for ( j = i + 1; j < size; j++ )
			{
				string cur_seq = * ( loops_edges[j].full_edge );

				if ( seq.compare ( cur_seq ) == 0 )
				{
					fprintf ( stderr, "ERROR: two equal loop edge sequence from same node, this should not happen!\n" );
					fprintf ( stderr, "%s\n", seq.c_str() );
					exit ( -1 );
				}

				if ( rc_seq.compare ( cur_seq ) == 0 )
				{
					fprintf ( stderr, "INFO: two loop edge sequence are reversed complemental!\n" );
					fprintf ( stderr, "%s\n", seq.c_str() );
					fprintf ( stderr, "%s\n", rc_seq.c_str() );
					need_output = 0;
					loops_edges[j].cvg += loops_edges[i].cvg;
					break;
				}
			}

			if ( need_output )
			{
				output_1edge ( &loops_edges[i], K_size, fp );
				//fprintf(stderr,"need output %d %s\n",i,seq.c_str());
			}

			delete ( loops_edges[i].full_edge );
			need_output = 1;
		}
	}

	return 0;
}
Example #2
0
/*
 * merge_linearV2
 *
 * Merges the run of nodes held in nStack into one edge and writes it
 * with output_1edge.
 *
 *   bal_edge: stored in the edge record, used when tagging nodes, and
 *             added to the global edge_c once the edge is written
 *   nStack:   stack of KMER_PT entries making up the run
 *   count:    the edge length is count - 1
 *   fp:       file pointer handed to output_1edge
 *
 * Steps, in order:
 *   1. unlink the last node from its predecessor and the first node from
 *      its successor
 *   2. for an edge of length 1, register a (K+1)-mer in KmerSetsPatch
 *   3. tag every internal node with the edge id and collect the sequence
 *      (written back to front) and the coverage sum
 *   4. compute the coverage, output the edge, release the buffer
 */
static void merge_linearV2 ( char bal_edge, STACK * nStack, int count, FILE * fp )
{
	KMER_PT * tail_np = ( KMER_PT * ) stackPop ( nStack );
	KMER_PT * before_tail_np = ( KMER_PT * ) stackPop ( nStack );
	KMER_PT * head_np, *after_head_np = NULL, *cur_np;
	preEDGE * edge = &temp_edge;
	kmer_t * inner, *long_node;
	Kmer fwd_word, rev_word, canon_word;
	ubyte8 hash_val;
	boolean existed;
	char * seq_buf;
	char prev_ch;
	long long cov_sum = 0;
	int edge_len = count - 1;
	int write_pos = edge_len - 1;
	int set_idx, use_fwd;

	/* short edges reuse the shared edge_seq buffer, long ones get their own */
	if ( edge_len < edge_length_limit )
	{
		seq_buf = edge_seq;
	}
	else
	{
		seq_buf = ( char * ) ckalloc ( edge_len * sizeof ( char ) );
	}

	edge->to_node = tail_np->kmer;
	edge->length = edge_len;
	edge->bal_edge = bal_edge;
	/* the sequence is filled from its last position towards the first */
	seq_buf[write_pos--] = lastCharInKmer ( tail_np->kmer );
	prev_ch = firstCharInKmer ( before_tail_np->kmer );
	dislink2prevUncertain ( tail_np->node, prev_ch, tail_np->isSmaller );
	/* rewind and pop down to the bottom: the last entry popped by the loop
	   is the second node, the entry left over is the first node */
	stackRecover ( nStack );

	while ( nStack->item_c > 1 )
	{
		after_head_np = ( KMER_PT * ) stackPop ( nStack );
	}

	head_np = ( KMER_PT * ) stackPop ( nStack );
	/* unlink the first node from the second one */
	dislink2nextUncertain ( head_np->node, lastCharInKmer ( after_head_np->kmer ), head_np->isSmaller );
	edge->from_node = head_np->kmer;

	if ( edge_len == 1 )
	{
		/* an edge of length 1 has no internal node: create a long kmer for it */
		nodeCounter++;
		fwd_word = KmerPlus ( edge->from_node, lastCharInKmer ( edge->to_node ) );
		rev_word = reverseComplement ( fwd_word, overlaplen + 1 );
		edge_c++;
		edgeCounter++;
		/* the smaller of the word and its reverse complement gets stored */
		use_fwd = KmerSmaller ( fwd_word, rev_word ) ? 1 : 0;
		canon_word = use_fwd ? fwd_word : rev_word;
		hash_val = hash_kmer ( canon_word );
		set_idx = hash_val % thrd_num;
		existed = put_kmerset ( KmerSetsPatch[set_idx], canon_word, 4, 4, &long_node );

		if ( existed )
		{
			printf ( "longNode %llx %llx already exist\n", canon_word.high, canon_word.low );
		}

		if ( use_fwd )
		{
			long_node->l_links = edge_c;
			long_node->twin = ( unsigned char ) ( bal_edge + 1 );
		}
		else
		{
			long_node->l_links = edge_c + bal_edge;
			long_node->twin = ( unsigned char ) ( -bal_edge + 1 );
		}
	}
	else
	{
		edge_c++;
		edgeCounter++;
	}

	/* mark all the internal nodes; the first pop only skips the last node */
	stackRecover ( nStack );
	cur_np = ( KMER_PT * ) stackPop ( nStack );

	while ( nStack->item_c > 1 )
	{
		cur_np = ( KMER_PT * ) stackPop ( nStack );
		inner = cur_np->node;
		inner->inEdge = 1;
		/* coverage is read before l_links is overwritten below.
		   NOTE(review): presumably get_kmer_left_covs reads l_links - keep this order */
		cov_sum += get_kmer_left_covs ( *inner );

		if ( !cur_np->isSmaller )
		{
			inner->l_links = edge_c + bal_edge;
			inner->twin = ( unsigned char ) ( -bal_edge + 1 );
		}
		else
		{
			inner->l_links = edge_c;
			inner->twin = ( unsigned char ) ( bal_edge + 1 );
		}

		seq_buf[write_pos--] = lastCharInKmer ( cur_np->kmer );
	}

	edge->seq = seq_buf;

	if ( edge_len <= 1 )
	{
		edge->cvg = 0;
	}
	else
	{
		/* integer average over the internal nodes, scaled by 10, capped at MaxEdgeCov */
		long long scaled_cov = cov_sum / ( edge_len - 1 ) * 10;
		edge->cvg = scaled_cov > MaxEdgeCov ? MaxEdgeCov : scaled_cov;
	}

	output_1edge ( edge, fp );

	/* only a privately allocated buffer is released */
	if ( edge_len >= edge_length_limit )
	{
		free ( ( void * ) seq_buf );
	}

	edge_c += bal_edge;

	if ( edge_c % 10000000 == 0 )
	{
		printf ( "--- %d edges built\n", edge_c );
	}
}
Example #3
0
/*************************************************
Function:
    process_1stack
Description:
    Processes the nodes in one stack
    1. Compacts the nodes to an edge sequence (stack2string)
    2. Checks palindrome
    3. Calculates coverage: the first edge's coverage for a two-node
       stack, otherwise the sum of edge_cov * (len + 1) over the stacked
       edges divided by (sequence length - K_size)
    4. Counts the edge as a tip and drops it, or stores it in
       loops_edges when it starts and ends on the same node, or writes
       it out with output_1edge
    Exits the program when the stack holds fewer than two nodes.
Input:
    1. ht:      the graph hashtable
    2. K_size:      kmer size
    3. stack:       the stack
    4. fp:      the file pointer for writing
    5. loops_edges: receives a copy of every loop edge; each copy owns a
                    string allocated with new that the caller must delete
Output:
    None.
Return:
    None.
*************************************************/
static void process_1stack ( hashtable2 * ht, int K_size, list<stacked_node2 *> &stack, FILE * fp, vector<preEDGE2> &loops_edges )
{
	static size_t edge_c;// edge id
	static preEDGE2 long_edge_buf;
	int TipLenTh = 3 * K_size; //orig 100
	int TipCovTh = 5;

	if ( stack.size() < 2 )
	{
		//size_t is cast explicitly so the argument matches the %llu conversion
		fprintf ( stderr, "only %llu nodes in the stack \n", ( unsigned long long ) stack.size() );
		exit ( -1 );
	}

	//palindrome check
	string full_edge = stack2string ( ht, K_size, stack ); //when output  skip the first kmer first
	bool palindrome = check_palindrome ( full_edge );
	int bal_edge = !palindrome;
	stacked_node2 * from_node = stack.front();
	stacked_node2 * to_node = stack.back();
	long_edge_buf.from_node = from_node;
	long_edge_buf.to_node = to_node;
	long_edge_buf.full_edge = &full_edge; //points at a local; cleared again before returning
	long_edge_buf.bal_edge = bal_edge;
	edge_c++;

	if ( stack.size() == 2 )
	{
		long_edge_buf.cvg = from_node->edge->edge_cov;
	}
	else
	{
		uint64_t symbol = 0; //cvg stat
		stacked_node2 * nd_tmp = from_node;

		while ( nd_tmp && nd_tmp->edge )
		{
			symbol += nd_tmp->edge->edge_cov * ( nd_tmp->edge->len + 1 );
			nd_tmp = nd_tmp->next;
		}

		int cvg = symbol / ( full_edge.size() - K_size );
		long_edge_buf.cvg = cvg;
	}

	int from_left, from_right, to_left, to_right;
	from_left = count_left_edge_num ( from_node->node );
	from_right = count_right_edge_num ( from_node->node );
	to_left = count_left_edge_num ( to_node->node );
	to_right = count_right_edge_num ( to_node->node );
	//tips control
	bool from_is_end = ( from_left + from_right == 1 );
	bool to_is_end = ( to_left + to_right == 1 );
	bool is_short = ( full_edge.size() < TipLenTh );
	bool low_cvg = ( long_edge_buf.cvg < TipCovTh );

	//a short edge is a tip when both ends are end nodes, or when one end is and the coverage is low
	if ( is_short && ( ( from_is_end && to_is_end ) || ( ( from_is_end || to_is_end ) && low_cvg ) ) ) //tips args
	{
		static size_t tip_num;
		tip_num++;
	}
	else if ( long_edge_buf.from_node->node == long_edge_buf.to_node->node )
	{
		//loop edge: keep a copy with its own heap string, the caller decides about output
		preEDGE2 loops = long_edge_buf;
		loops.full_edge = new string ( * ( long_edge_buf.full_edge ) );
		loops_edges.push_back ( loops );
	}
	else
	{
		//output edge
		output_1edge ( &long_edge_buf, K_size, fp );
	}

	edge_c += bal_edge;
	//full_edge dies with this call; do not leave the static buffer pointing at it
	long_edge_buf.full_edge = NULL;
}