/*************************************************
Function:
    lastKmer
Description:
    Builds the kmer reached by following one kmer-edge of a node and looks
    that kmer up in the graph hashtable. The lookup uses whichever of the
    kmer and its reverse complement does not compare greater.
Input:
    1. ht:          the graph hashtable
    2. K_size:      kmer size
    3. node:        the node whose kmer-edge is followed
    4. edge:        the kmer-edge
    5. is_left:     non-zero if the kmer-edge is on the node's left side
Output:
    1. smaller:     1 if the constructed kmer was looked up as-is,
                    0 if its reverse complement was looked up instead
Return:
    The result of search_kmer() for the chosen kmer.
    NULL if node or edge is NULL.
    The process exits if the edge is longer than the kmer size.
*************************************************/
static bucket2 * lastKmer ( hashtable2 * ht, int K_size, bucket2 * node, edge_node * edge, int is_left, int & smaller )
{
	if ( node == NULL || edge == NULL )
	{
		return NULL;
	}

	// The stored length is one less than the number of bases the edge spans.
	const int span = edge->len + 1;

	if ( span > K_size )
	{
		fprintf ( stderr, "ERROR: g value should be no great than kmer size!\n" );
		exit ( -1 );
	}

	// Place the packed edge bases in the last word of an otherwise zeroed kmer.
	kmer_t2 edge_bits;
	memset ( edge_bits.kmer, 0, sizeof ( edge_bits ) );
	( edge_bits.kmer ) [sizeof ( edge_bits ) / sizeof ( uint64_t ) - 1] = edge->edge;

	kmer_t2 mask;
	initKmerFilter ( K_size, &mask );

	// Shift the node kmer to make room for the edge bases on the proper side.
	kmer_t2 target = node->kmer_t2;

	if ( is_left )
	{
		kmerMoveRight ( &target, span );
		kmerMoveLeft ( &edge_bits, K_size - span );
	}
	else
	{
		kmerMoveLeft ( &target, span );
	}

	// Merge the edge bases in, then apply the K_size filter.
	kmerOr ( &target, &edge_bits );
	kmerAnd ( &target, &mask );

	// Pick between the kmer and its reverse complement for the lookup.
	kmer_t2 rev_comp = target;
	reverseCompKmer ( &rev_comp, K_size );

	const bool use_rev_comp = ( kmerCompare ( &target, &rev_comp ) > 0 );

	if ( use_rev_comp )
	{
		target = rev_comp;
	}

	smaller = use_rev_comp ? 0 : 1;
	return search_kmer ( ht, &target );
}
/************************************************* Function: process_1read_preArc Description: This is the core function for building preArcs. 1. Chops one read into kmers. 2. Searches the kmers in vertex hash. 3. Aligns the vertex's kmer-edge sequences to the read sequence on both sides. 4. Constructs preArcs according the mapping result on both sides of a vertex. @since r53: 5. add -R support, solves tiny repeat. Input: 1. arc_arr: preArc array 2. locks: locks array 3. v_ht: vertex hash 4. K_size: kmer size 5. cut_off_len: cut off length 6. read: read Output: None. Return: None. *************************************************/ void process_1read_preArc ( preArc_array * arc_arr, pthread_spinlock_t * locks, int thread_id, vertex_hash2 * v_ht, int K_size, int cut_off_len, const char * read ) { const int BUFF_LEN = 1024; kmer_t2 kmers[BUFF_LEN]; int kmer_array_len = cut_off_len - K_size + 1; int kmer_num ; vertex2 * v_tmp; edge_starter2 * e_tmp; size_t left_id; size_t right_id; int left_found = 0, right_found = 0; int edge_len; //update //int map_len; //int shortest_maplen = 0; //add for -R solving tiny repeats unsigned int path[128]; unsigned int counter = 0; //int read_len,i=0; int read_len = strlen ( read ); /* while(read[i]!='\0'){ i++; } read_len = i; //read_len = strlen(read); if(read[read_len-1]=='\n'){ read[read_len-1]='\0'; read_len--; }*/ if ( read_len > cut_off_len ) { read_len = cut_off_len; } kmer_array_len = read_len - K_size + 1; chop_kmers ( read, read_len, K_size, kmers, kmer_array_len, kmer_num ); for ( int i = 1; i < kmer_num - 1; ++i ) //search every kmer exclude the begin and end kmer { v_tmp = search_vertex ( v_ht, &kmers[i] ); if ( v_tmp ) //found { //search left edge kmer got left id e_tmp = v_tmp->left; while ( e_tmp ) { edge_len = e_tmp->len; if ( edge_len <= i ) { if ( kmerCompare ( & ( kmers[i - edge_len] ), & ( e_tmp->edge_kmer ) ) == 0 ) { left_id = e_tmp->edge_id; if ( left_found ) { fprintf ( stderr, "ERROR: left edge id found 
already !new found id %llu \n", left_id ); fprintf ( stderr, "i:%d ,edge_len:%d\n", i, edge_len ); printKmerSeq ( & ( kmers[i - edge_len] ), K_size, stderr ); printKmerSeq ( & ( e_tmp->edge_kmer ), K_size, stderr ); exit ( 1 ); }; left_found = 1; break; } } else { kmer_t2 read_edge = kmers[0]; if ( K_size > i ) { kmerMoveRight ( &read_edge, K_size - i ); } kmer_t2 KMER_FILTER; initKmerFilter ( i, &KMER_FILTER ); kmer_t2 edge_kmer = e_tmp->edge_kmer; if ( K_size > edge_len ) { kmerMoveRight ( &edge_kmer, K_size - edge_len ); } kmerAnd ( &read_edge, &KMER_FILTER ); kmerAnd ( &edge_kmer, &KMER_FILTER ); if ( kmerCompare ( &read_edge, &edge_kmer ) == 0 ) { left_found++; left_id = e_tmp->edge_id; if ( left_found == 2 ) { //debug_build<<"can't distinct which left edge\n"; break; } } } e_tmp = e_tmp->next; } //update maplen_control /* if(edge_len >= shortest_maplen){ if(map_len < shortest_maplen) left_found = 0; }else{ if(map_len != edge_len) left_found = 0; }*/ if ( left_found != 1 ) {left_found = 0; right_found = 0; continue;} //not found or multi found //todo : aln if left_found = 0 ... 
find the best //search right edge kmer got right id e_tmp = v_tmp->right; while ( e_tmp ) { edge_len = e_tmp->len; if ( edge_len <= kmer_num - 1 - i ) { if ( kmerCompare ( & ( kmers[i + edge_len] ), & ( e_tmp->edge_kmer ) ) == 0 ) { right_id = e_tmp->edge_id; if ( right_found ) { fprintf ( stderr, "ERROR: right edge id found already, new found id %llu !\n", right_id ); fprintf ( stderr, "i:%d ,edge_len:%d\n", i, edge_len ); printKmerSeq ( & ( kmers[i + edge_len] ), K_size, stderr ); printKmerSeq ( & ( e_tmp->edge_kmer ), K_size, stderr ); exit ( 1 ); }; right_found = 1; break; } } else { int read_edge_len = ( kmer_num - 1 - i ); kmer_t2 KMER_FILTER; initKmerFilter ( read_edge_len, &KMER_FILTER ); kmer_t2 read_edge = kmers[kmer_num - 1]; kmerAnd ( &read_edge, &KMER_FILTER ); kmer_t2 edge_kmer = e_tmp->edge_kmer; if ( edge_len > read_edge_len ) { kmerMoveRight ( &edge_kmer, ( edge_len - read_edge_len ) ); } kmerAnd ( &edge_kmer, &KMER_FILTER ); if ( kmerCompare ( &read_edge, &edge_kmer ) == 0 ) { right_found++; right_id = e_tmp->edge_id; if ( right_found == 2 ) { //debug_build<<"can't distinct which right edge\n"; break; } } } e_tmp = e_tmp->next; } //update map_len control /* if(edge_len >= shortest_maplen){ if(map_len < shortest_maplen) right_found = 0; }else{ if(map_len != edge_len) right_found = 0; }*/ if ( right_found != 1 ) {left_found = 0; right_found = 0; continue;} //todo : aln if right_found = 0 ... find the best //if(left_found == 1 && right_found ==1) //store this preArc //preArc_array *arc_arr put_preArc_threaded ( arc_arr, locks, left_id, right_id, 1 ); //constructing the path ... if ( solve ) { if ( counter == 0 ) { counter = 2; path[1] = left_id; path[2] = right_id; } else if ( counter <= 100 ) { if ( path[counter] == left_id ) { path[++counter] = right_id; } else { path[++counter] = left_id; path[++counter] = right_id; } } } //end ... 
left_found = 0; right_found = 0; } } //add to path buffer , if full filled ,output it if ( solve ) { if ( counter >= 3 && counter <= 100 ) { path[0] = counter; int tmp = is_full ( path_buffer[thread_id] ); if ( tmp == 1 ) { //output it output_edge_path_buffer_locked ( path_buffer[thread_id], path_fp, &file_lock ); } else if ( tmp == -1 ) { //error status fprintf ( stderr, "ERROR: path buffer overflow!! system exit .\n" ); exit ( -1 ); } put_path_2_buffer ( path_buffer[thread_id], path ); } } }