/************************************************* Function: startEdgeFromNode Description: Constructs edges from a branched node or end node. for every branch (left , right) 1. Puts the linear node into a stack 2. Checks the edge to be built form the stack are plalindrome or not 3. Builds an edge by merge the linear nodes Input: 1. ht: the graph hashtable 2. K_size: kmer size 3. fp: the file pointer for writing out edge sequences Output: None. Return: Zero. *************************************************/ static int startEdgeFromNode ( hashtable2 * ht, int K_size, bucket2 * node, FILE * fp ) { static size_t call_times; call_times++; if ( node->kmer_info.linear || node->kmer_info.deleted ) { return 0;//linear node ... } int left, right; left = count_left_edge_num ( node ); right = count_right_edge_num ( node ); if ( left == 0 && right == 0 ) { return 0; //it's a dead node } list<stacked_node2 *> stack; edge_node * t_edge = NULL, *t_next = NULL; stacked_node2 * t_stacked_node = NULL; vector<preEDGE2> loops_edges; int node_c; //for right edge t_edge = node->kmer_info.right; while ( t_edge ) { if ( t_edge->used == 1 ) { t_edge = t_edge->nxt_edge; continue; } t_stacked_node = ( stacked_node2 * ) malloc ( sizeof ( stacked_node2 ) ); t_stacked_node->node = node; t_stacked_node->is_left = 0; t_stacked_node->edge = t_edge; t_stacked_node->next = NULL; stack.push_back ( t_stacked_node ); t_edge->used = 1; stringBeads ( ht, K_size, stack, t_stacked_node, t_edge, &node_c ); process_1stack ( ht, K_size, stack, fp, loops_edges ); t_next = t_edge->nxt_edge;//because this procedure will remove the edge t_edge dislink ( ht, K_size, stack.front() ); if ( stack.size() > 2 ) { stack.pop_back();//change the stack if ( stack.back() && stack.size() > 1 ) //last but second node { dislink ( ht, K_size, stack.back() ); } } stacked_node2 * head, *tmp_node; head = stack.front(); while ( head ) { tmp_node = head; free ( tmp_node ); head = head->next; } stack.clear(); t_edge = t_next; } //for left edge t_edge = node->kmer_info.left; while ( t_edge ) { if ( t_edge->used == 1 ) { t_edge = t_edge->nxt_edge; continue; } t_stacked_node = ( stacked_node2 * ) malloc ( sizeof ( stacked_node2 ) ); t_stacked_node->node = node; t_stacked_node->is_left = 1; t_stacked_node->edge = t_edge; t_stacked_node->next = NULL; stack.push_back ( t_stacked_node ); t_edge->used = 1; stringBeads ( ht, K_size, stack, t_stacked_node, t_edge, &node_c ); // process_1stack ( ht, K_size, stack, fp, loops_edges ); t_next = t_edge->nxt_edge;//because this procedure will remove the edge t_edge dislink ( ht, K_size, stack.front() ); if ( stack.size() > 2 ) { stack.pop_back();//change the stack if ( stack.back() && stack.size() > 1 ) //last but second node { dislink ( ht, K_size, stack.back() ); } } //debug<<"before free stack"<<endl; stacked_node2 * head, *tmp_node; head = stack.front(); while ( head ) { tmp_node = head; free ( tmp_node ); head = head->next; } stack.clear(); t_edge = t_next; } if ( loops_edges.size() > 0 ) { //fprintf(stderr,"loops_edges size %llu\n",loops_edges.size()); int i, j, size; bool need_output; size = loops_edges.size(); need_output = 1; //bool debug = 0; for ( i = 0; i < size; i++ ) { string seq = * ( loops_edges[i].full_edge ); string rc_seq = revCompSeq ( seq ); /* if(seq.compare("AATTGGACGTGAGAGCAAATTGTATTGAGCATACAATTTGCTCTCACGTCCAATT") == 0) { fprintf(stderr,"in loops_edges %d %s\n",i,seq.c_str()); debug = 1; } if(seq.compare("AATTGGACGTGAGAGCAAATTGTATGCTCAATACAATTTGCTCTCACGTCCAATT") == 0) { fprintf(stderr,"in loops_edges %d %s\n",i,seq.c_str()); debug = 1; } if(debug ){ fprintf(stderr, "%d %s\n",i,seq.c_str()); fprintf(stderr, "%d %s\n",i,rc_seq.c_str()); }*/ for ( j = i + 1; j < size; j++ ) { string cur_seq = * ( loops_edges[j].full_edge ); if ( seq.compare ( cur_seq ) == 0 ) { fprintf ( stderr, "ERROR: two equal loop edge sequence from same node, this should not happen!\n" ); fprintf ( stderr, "%s\n", seq.c_str() ); exit ( -1 ); } if ( rc_seq.compare ( cur_seq ) == 0 ) { fprintf ( stderr, "INFO: two loop edge sequence are reversed complemental!\n" ); fprintf ( stderr, "%s\n", seq.c_str() ); fprintf ( stderr, "%s\n", rc_seq.c_str() ); need_output = 0; loops_edges[j].cvg += loops_edges[i].cvg; break; } } if ( need_output ) { output_1edge ( &loops_edges[i], K_size, fp ); //fprintf(stderr,"need output %d %s\n",i,seq.c_str()); } delete ( loops_edges[i].full_edge ); need_output = 1; } } return 0; }
static void merge_linearV2 ( char bal_edge, STACK * nStack, int count, FILE * fp ) { int length, char_index; preEDGE * newedge; kmer_t * del_node, *longNode; char * tightSeq, firstCh; long long symbol = 0; int len_tSeq; Kmer wordplus, bal_wordplus; ubyte8 hash_ban; KMER_PT * last_np = ( KMER_PT * ) stackPop ( nStack ); KMER_PT * second_last_np = ( KMER_PT * ) stackPop ( nStack ); KMER_PT * first_np, *second_np = NULL; KMER_PT * temp; boolean found, lastOne = 1, single = 1; int setPicker; length = count - 1; len_tSeq = length; if ( len_tSeq >= edge_length_limit ) { tightSeq = ( char * ) ckalloc ( len_tSeq * sizeof ( char ) ); } else { tightSeq = edge_seq; } char_index = length - 1; newedge = &temp_edge; newedge->to_node = last_np->kmer; newedge->length = length; newedge->bal_edge = bal_edge; tightSeq[char_index--] = lastCharInKmer ( last_np->kmer ); firstCh = firstCharInKmer ( second_last_np->kmer ); dislink2prevUncertain ( last_np->node, firstCh, last_np->isSmaller ); stackRecover ( nStack ); while ( nStack->item_c > 1 ) { second_np = ( KMER_PT * ) stackPop ( nStack ); } first_np = ( KMER_PT * ) stackPop ( nStack ); //unlink first node to the second one dislink2nextUncertain ( first_np->node, lastCharInKmer ( second_np->kmer ), first_np->isSmaller ); //printf("from %llx, to %llx\n",first_np->node->seq,last_np->node->seq); //now temp is the last node in line, out_node is the second last node in line newedge->from_node = first_np->kmer; //create a long kmer for edge with length 1 if ( length == 1 ) { nodeCounter++; wordplus = KmerPlus ( newedge->from_node, lastCharInKmer ( newedge->to_node ) ); bal_wordplus = reverseComplement ( wordplus, overlaplen + 1 ); /* Kmer temp = KmerPlus(reverseComplement(newedge->to_node,overlaplen), lastCharInKmer(reverseComplement(newedge->from_node,overlaplen))); fprintf(stderr,"(%llx %llx) (%llx %llx) (%llx %llx)\n", wordplus.high,wordplus.low,temp.high,temp.low, bal_wordplus.high,bal_wordplus.low); */ edge_c++; edgeCounter++; if ( KmerSmaller ( wordplus, bal_wordplus ) ) { hash_ban = hash_kmer ( wordplus ); setPicker = hash_ban % thrd_num; found = put_kmerset ( KmerSetsPatch[setPicker], wordplus, 4, 4, &longNode ); if ( found ) { printf ( "longNode %llx %llx already exist\n", wordplus.high, wordplus.low ); } longNode->l_links = edge_c; longNode->twin = ( unsigned char ) ( bal_edge + 1 ); } else { hash_ban = hash_kmer ( bal_wordplus ); setPicker = hash_ban % thrd_num; found = put_kmerset ( KmerSetsPatch[setPicker], bal_wordplus, 4, 4, &longNode ); if ( found ) { printf ( "longNode %llx %llx already exist\n", bal_wordplus.high, bal_wordplus.low ); } longNode->l_links = edge_c + bal_edge; longNode->twin = ( unsigned char ) ( -bal_edge + 1 ); } } else { edge_c++; edgeCounter++; } stackRecover ( nStack ); //mark all the internal nodes temp = ( KMER_PT * ) stackPop ( nStack ); while ( nStack->item_c > 1 ) { temp = ( KMER_PT * ) stackPop ( nStack ); del_node = temp->node; del_node->inEdge = 1; symbol += get_kmer_left_covs ( *del_node ); if ( temp->isSmaller ) { del_node->l_links = edge_c; del_node->twin = ( unsigned char ) ( bal_edge + 1 ); } else { del_node->l_links = edge_c + bal_edge; del_node->twin = ( unsigned char ) ( -bal_edge + 1 ); } tightSeq[char_index--] = lastCharInKmer ( temp->kmer ); } newedge->seq = tightSeq; if ( length > 1 ) { newedge->cvg = symbol / ( length - 1 ) * 10 > MaxEdgeCov ? MaxEdgeCov : symbol / ( length - 1 ) * 10; } else { newedge->cvg = 0; } output_1edge ( newedge, fp ); if ( len_tSeq >= edge_length_limit ) { free ( ( void * ) tightSeq ); } edge_c += bal_edge; if ( edge_c % 10000000 == 0 ) { printf ( "--- %d edges built\n", edge_c ); } return; }
/************************************************* Function: process_1stack Description: Processes the nodes in one stack 1. Compacts the nodes to an edge 2. Checks palindrome 3. Calculates coverage Input: 1. ht: the graph hashtable 2. K_size: kmer size 3. stack: the stack 4. fp: the file pointer for writing Output: None. Return: None. *************************************************/ static void process_1stack ( hashtable2 * ht, int K_size, list<stacked_node2 *> &stack, FILE * fp, vector<preEDGE2> &loops_edges ) { static size_t edge_c;// edge id static preEDGE2 long_edge_buf; preEDGE2 loops; int TipLenTh = 3 * K_size; //orig 100 int TipCovTh = 5; if ( stack.size() < 2 ) { fprintf ( stderr, "only %llu nodes in the stack \n", stack.size() ); exit ( -1 ); } else { //palindrome check string full_edge = stack2string ( ht, K_size, stack ); //when output skip the first kmer first stacked_node2 * test = stack.front(); bool palindrome = check_palindrome ( full_edge ); int bal_edge = !palindrome; stacked_node2 * from_node = stack.front(); stacked_node2 * to_node = stack.back(); long_edge_buf.from_node = from_node; long_edge_buf.to_node = to_node; long_edge_buf.full_edge = &full_edge; long_edge_buf.bal_edge = bal_edge; uint64_t symbol = 0; //cvg stat edge_c++; if ( stack.size() == 2 ) { long_edge_buf.cvg = from_node->edge->edge_cov; } else { stacked_node2 * nd_tmp = from_node; while ( nd_tmp && nd_tmp->edge ) { symbol += nd_tmp->edge->edge_cov * ( nd_tmp->edge->len + 1 ); nd_tmp = nd_tmp->next; } int cvg = symbol / ( full_edge.size() - K_size ); long_edge_buf.cvg = cvg; } int from_left, from_right, to_left, to_right; from_left = count_left_edge_num ( from_node->node ); from_right = count_right_edge_num ( from_node->node ); to_left = count_left_edge_num ( to_node->node ); to_right = count_right_edge_num ( to_node->node ); //tips control if ( ( ( from_left + from_right == 1 ) && ( to_left + to_right == 1 ) && ( full_edge.size() < TipLenTh ) ) || ( ( ( from_left + from_right == 1 ) || ( to_left + to_right == 1 ) ) && ( full_edge.size() < TipLenTh ) && long_edge_buf.cvg < TipCovTh ) ) //tips args { //if(full_edge.size()<TipLenTh && long_edge_buf.cvg<TipCovTh){//it's a tip or low cvg link static size_t tip_num; tip_num++; } else { //debug begin /* string bug_seq = *(long_edge_buf.full_edge); if(bug_seq.compare("AATTGGACGTGAGAGCAAATTGTATTGAGCATACAATTTGCTCTCACGTCCAATT") == 0) { fprintf(stderr,"%s\n",bug_seq.c_str()); fprintf(stderr,"from %llx to %llx \n",long_edge_buf.from_node->node,long_edge_buf.to_node->node); } if(bug_seq.compare("AATTGGACGTGAGAGCAAATTGTATGCTCAATACAATTTGCTCTCACGTCCAATT") == 0) { fprintf(stderr,"%s\n",bug_seq.c_str()); fprintf(stderr,"from %llx to %llx \n",long_edge_buf.from_node->node,long_edge_buf.to_node->node); }*/ //debug end if ( long_edge_buf.from_node->node == long_edge_buf.to_node->node ) { loops = long_edge_buf; loops.full_edge = new string ( * ( long_edge_buf.full_edge ) ); loops_edges.push_back ( loops ); } else { //output edge output_1edge ( &long_edge_buf, K_size, fp ); } } edge_c += bal_edge; } }