/*
 * Scan through all columns and identify the set within threshold;
 * "fuzziness" of the block is controlled by TOLERANCE (-c).
 *
 * The global profile is zeroed, seed_update() is called once for every gene
 * in gene_set (presumably accumulating per-column symbol counts into the
 * profile -- confirm against seed_update), and each column in which some
 * symbol with index >= 1 reaches ceil(TOLERANCE * |gene_set|) is pushed onto
 * b_ptr->conds.  b_ptr->block_cols is then set to the size of b_ptr->conds.
 *
 * gene_set - stack of row indices into arr_c; only read here
 * b_ptr    - block whose conds stack (already allocated by the caller) and
 *            block_cols field are updated
 */
void scan_block (struct dyStack *gene_set, Block *b_ptr)
{
	const int n_genes = dsSize(gene_set);
	int col, sym, g;

	/* start from an empty profile */
	for (col = 0; col < cols; col++) {
		for (sym = 0; sym < sigma; sym++)
			profile[col][sym] = 0;
	}

	/* feed every gene of the set to seed_update */
	for (g = 0; g < n_genes; g++)
		seed_update(arr_c[dsItem(gene_set, g)]);

	/* smallest profile entry that makes a column acceptable */
	int min_support = ceil(po->TOLERANCE * n_genes);

	for (col = 0; col < cols; col++) {
		/* the scan starts from 1 because symbols[0]=0 */
		sym = 1;
		while (sym < sigma && profile[col][sym] < min_support)
			sym++;

		/* stopped early: some symbol satisfied the tolerance */
		if (sym < sigma)
			dsPush(b_ptr->conds, col);
	}

	b_ptr->block_cols = dsSize(b_ptr->conds);
}
/*
 * Core algorithm.
 *
 * Visits the first n seed edges of el in order.  Every seed that passes the
 * filters below is expanded into a candidate block; blocks that end up with
 * at least one column are collected (at most po->SCH_BLOCK of them) and then
 * passed to block_enrichment() and report_blocks().
 *
 * fw - stream handed through to block_enrichment() and report_blocks()
 * el - array of pointers to seed edges
 * n  - number of edges in el to visit
 *
 * Returns the value returned by report_blocks().
 *
 * The block array bb, the accepted blocks and the global profile are not
 * released here.
 * NOTE(review): confirm whether report_blocks() (or its caller) owns them.
 */
int cluster (FILE *fw, Edge **el, int n)
{
	int block_id = 0;
	Block **bb;
	int allocated = po->SCH_BLOCK;
	AllocArray(bb, allocated);

	Edge *e;
	Block *b;
	struct dyStack *genes, *scores, *b_genes, *allincluster;
	int i, j, k, components;

	AllocArray(profile, cols);
	for (j = 0; j < cols; j++)
		AllocArray(profile[j], sigma);

	genes = dsNew(rows);
	scores = dsNew(rows);
	allincluster = dsNew(rows);

	bool *candidates;
	AllocArray(candidates, rows);

	for (i = 0; i < n; i++)
	{
		e = *el++;

		/* decide whether this seed is worth expanding */
		bool flag = TRUE;
		/* speed up the program if there are more than 250 rows: only test
		 * membership in allincluster instead of calling check_seed() */
		if (rows > 250)
		{
			if (isInStack(allincluster, e->gene_one) && isInStack(allincluster, e->gene_two))
				flag = FALSE;
			else if ((po->IS_TFname) && (e->gene_one != TFindex) && (e->gene_two != TFindex))
				flag = FALSE;
			else if ((po->IS_list) && (!sublist[e->gene_one] || !sublist[e->gene_two]))
				flag = FALSE;
		}
		else
		{
			flag = check_seed(e, bb, block_id);
			if ((po->IS_TFname) && (e->gene_one != TFindex) && (e->gene_two != TFindex))
				flag = FALSE;
			if ((po->IS_list) && (!sublist[e->gene_one] || !sublist[e->gene_two]))
				flag = FALSE;
		}
		if (!flag)
			continue;

		for (j = 0; j < cols; j++)
			for (k = 0; k < sigma; k++)
				profile[j][k] = 0;

		/* a struct must be allocated before its members can be used */
		AllocVar(b);
		/* initial value of b->score */
		b->score = MIN(2, e->score);

		/* Fill both stacks with -1 and clear them again, so that the
		 * traceback below reads -1 from slots past the live entries. */
		int ii;
		dsClear(genes);
		dsClear(scores);
		for (ii = 0; ii < rows; ii++)
		{
			dsPush(genes, -1);
			dsPush(scores, -1);
		}
		dsClear(genes);
		dsClear(scores);

		dsPush(genes, e->gene_one);
		dsPush(genes, e->gene_two);
		dsPush(scores, 1);
		dsPush(scores, b->score);

		/* branch-and-cut condition for seed expansion */
		int cand_threshold = floor(po->COL_WIDTH * po->TOLERANCE);
		if (cand_threshold < 2)
			cand_threshold = 2;

		/* maintain a candidate list to avoid looping through all rows */
		for (j = 0; j < rows; j++)
			candidates[j] = TRUE;
		candidates[e->gene_one] = candidates[e->gene_two] = FALSE;
		components = 2;

		/* expansion step, generate a bicluster without noise */
		block_init(e, b, genes, scores, candidates, cand_threshold, &components, allincluster);

		/* Track back to the last position holding the best score.  The
		 * look-ahead is skipped at the final slot: index rows would lie one
		 * past the rows entries requested from dsNew(). */
		for (k = 0; k < components; k++)
		{
			if (dsItem(scores, k) != b->score)
				continue;
			if (k + 1 >= rows || dsItem(scores, k + 1) != b->score)
				break;
		}
		components = k + 1;

		int ki;
		for (ki = 0; ki < rows; ki++)
			candidates[ki] = TRUE;

		for (ki = 0; ki < components - 1; ki++)
		{
			seed_update(arr_c[dsItem(genes, ki)]);
			candidates[dsItem(genes, ki)] = FALSE;
		}
		candidates[dsItem(genes, k)] = FALSE;
		/* cut the gene stack back to the first k+1 entries */
		genes->top = k;

		int cnt = 0;
		bool *colcand;
		AllocArray(colcand, cols);
		for (ki = 0; ki < cols; ki++)
			colcand[ki] = FALSE;

		/* add columns satisfy the conservative r */
		seed_current_modify(arr_c[dsItem(genes, k)], colcand, &cnt, components);

		/* cnt does not change in the two loops below, so compute the
		 * required number of matching columns once */
		const double min_match = floor(cnt * po->TOLERANCE);

		/* add some new possible genes */
		int m_cnt = 0;
		continuous KL_score = 0;
		/* NOTE(review): sub_array is never released in this function; if
		 * get_intersect_row()/get_intersect_reverse_row() allocate it, it
		 * leaks -- confirm against their definitions. */
		discrete *sub_array;
		for (ki = 0; ki < rows; ki++)
		{
			if (po->IS_list && !sublist[ki])
				continue;
			m_cnt = intersect_row(colcand, arr_c[dsItem(genes, 0)], arr_c[ki]);
			if (candidates[ki] && (m_cnt >= min_match))
			{
				sub_array = get_intersect_row(colcand, arr_c[dsItem(genes, 0)], arr_c[ki], m_cnt);
				KL_score = get_KL(sub_array, arr_c[ki], m_cnt, cols);
				if (KL_score >= b->significance * po->TOLERANCE)
				{
					dsPush(genes, ki);
					components++;
					candidates[ki] = FALSE;
				}
			}
		}
		/* row count before the reverse-matching rows are appended */
		b->block_rows_pre = components;

		/* add genes that negative regulated to the consensus */
		for (ki = 0; ki < rows; ki++)
		{
			if (po->IS_list && !sublist[ki])
				continue;
			m_cnt = reverse_row(colcand, arr_c[dsItem(genes, 0)], arr_c[ki]);
			if (candidates[ki] && (m_cnt >= min_match))
			{
				sub_array = get_intersect_reverse_row(colcand, arr_c[dsItem(genes, 0)], arr_c[ki], m_cnt);
				KL_score = get_KL(sub_array, arr_c[ki], m_cnt, cols);
				if (KL_score >= b->significance * po->TOLERANCE)
				{
					dsPush(genes, ki);
					components++;
					candidates[ki] = FALSE;
				}
			}
		}
		free(colcand);

		/* copy of the first block_rows_pre genes, used only by scan_block */
		b_genes = dsNew(b->block_rows_pre);
		for (ki = 0; ki < b->block_rows_pre; ki++)
			dsPush(b_genes, dsItem(genes, ki));

		/* store gene arrays inside block */
		b->genes = dsNew(components);
		b->conds = dsNew(cols);

		scan_block(b_genes, b);
		/* scan_block only reads b_genes, so it can be released right away */
		free(b_genes);

		if (b->block_cols == 0)
		{
			/* rejected block: release it instead of leaking it */
			free(b->genes);
			free(b->conds);
			free(b);
			continue;
		}

		b->block_rows = components;
		/* b->score keeps the value chosen during expansion */

		dsClear(b->genes);
		for (ki = 0; ki < components; ki++)
			dsPush(b->genes, dsItem(genes, ki));

		/* remember every gene that has appeared in an accepted block */
		for (ki = 0; ki < components; ki++)
			if (!isInStack(allincluster, dsItem(genes, ki)))
				dsPush(allincluster, dsItem(genes, ki));

		/* save the current block b to the block list bb so that the blocks
		 * can be sorted by their score */
		bb[block_id++] = b;

		/* reaching the results number limit */
		if (block_id == po->SCH_BLOCK)
			break;
		verboseDot();
	}

	putchar('\n');

	/* free-up the candidate list and the working stacks */
	free(candidates);
	free(allincluster);
	free(genes);
	free(scores);

	block_enrichment(fw, bb, block_id);
	return report_blocks(fw, bb, block_id);
}
/*
 * Greedy expansion of a two-gene seed.
 *
 * On entry genes holds the two seed rows and scores the two initial scores.
 * Rows are appended one at a time -- always the remaining candidate sharing
 * the most candidate columns with the first seed row -- until the best
 * candidate falls below po->COL_WIDTH (and, with po->IS_cond, below
 * b->cond_low_bound) or no candidate is left.
 *
 * e              - seed edge (not used in this function)
 * b              - block being built; cond_low_bound is set, score and
 *                  significance are raised whenever a step scores higher
 * genes, scores  - stacks receiving each added row and the score of its step
 * candidates     - per-row flags; rows are switched off as they are pruned
 *                  or consumed
 * cand_threshold - candidates matching fewer columns than this are dropped
 * components     - in/out row counter; it is incremented before each attempt,
 *                  so on return it is one larger than the number of genes
 * allincluster   - not used in this function
 */
static void block_init(Edge *e, Block *b, struct dyStack *genes, struct dyStack *scores, bool *candidates, const int cand_threshold, int *components, struct dyStack *allincluster)
{
	int i, j = 0, score, top;
	int cnt = 0, col_num = 0;
	int max_cnt, max_i;

	int *arr_rows, *arr_rows_b;
	AllocArray(arr_rows, rows);
	AllocArray(arr_rows_b, rows);

	bool *colcand;
	AllocArray(colcand, cols);
	for (i = 0; i < cols; i++)
		colcand[i] = FALSE;

	discrete *g1, *g2;
	/* NOTE(review): sub_array is never released here; if get_intersect_row()
	 * allocates it, it leaks once per step -- confirm. */
	discrete *sub_array, col_array[2], col_all[rows];
	/* KL_score[] is only written for the initial candidate columns */
	continuous KL_score[cols], KL_score_c = 0, KL_score_r = 0;

	g1 = arr_c[dsItem(genes, 0)];
	g2 = arr_c[dsItem(genes, 1)];

	/* initial colcand: columns where both seed rows carry the same value
	 * and that value maps to a non-zero symbol */
	for (i = 0; i < cols; i++)
	{
		if ((g1[i] == g2[i]) && (symbols[g1[i]] != 0))
		{
			colcand[i] = TRUE;
			for (j = 0; j < rows; j++)
				col_all[j] = arr_c[j][i];
			col_array[0] = col_array[1] = symbols[g1[i]];
			KL_score[i] = get_KL(col_array, col_all, 2, rows);
			KL_score_c += KL_score[i];
			col_num++;
		}
	}
	/* mean over the initial candidate columns */
	KL_score_c = KL_score_c / col_num;

	for (i = 0; i < rows; i++)
	{
		arr_rows[i] = intersect_row(colcand, arr_c[dsItem(genes, 0)], arr_c[i]);
		arr_rows_b[i] = arr_rows[i];
	}

	/* we just get the largest 100 rows when we initial a bicluster because we
	 * believe that the 100 rows can characterize the structure of the
	 * bicluster; it also reduces the time complexity */
	if (rows > 100)
	{
		qsort(arr_rows_b, rows, sizeof *arr_rows_b, compare_int);
		top = arr_rows_b[rows - 100];
		for (i = 0; i < rows; i++)
			if (arr_rows[i] < top)
				candidates[i] = FALSE;
	}

	/* calculate the condition low bound for current seed.  With fewer than
	 * 20 rows floor(0.05*rows) is 0 and arr_rows_b[rows] would be read out
	 * of bounds, so at least the last element is used.
	 * NOTE(review): arr_rows_b is only sorted when rows > 100; for smaller
	 * inputs this bound is taken from the unsorted counts -- confirm that
	 * is intended. */
	int cutoff = floor(0.05 * rows);
	if (cutoff < 1)
		cutoff = 1;
	b->cond_low_bound = arr_rows_b[rows - cutoff];

	/* neither array is needed past this point */
	free(arr_rows);
	free(arr_rows_b);

	while (*components < rows)
	{
		max_cnt = -1;
		max_i = -1;
		(*components)++;

		/* find the candidate sharing the most columns with the first seed
		 * row, pruning weak candidates on the way */
		for (i = 0; i < rows; i++)
		{
			if (!candidates[i])
				continue;
			if (po->IS_list && !sublist[i])
				continue;
			cnt = intersect_row(colcand, arr_c[dsItem(genes, 0)], arr_c[i]);
			if (cnt < cand_threshold)
				candidates[i] = FALSE;
			if (cnt > max_cnt)
			{
				max_cnt = cnt;
				max_i = i;
			}
		}

		if (max_cnt > 0)
		{
			/* row significance: get_KL of the best candidate plus that of
			 * every gene already in the block, divided by *components */
			sub_array = get_intersect_row(colcand, arr_c[dsItem(genes, 0)], arr_c[max_i], max_cnt);
			KL_score_r = get_KL(sub_array, arr_c[max_i], max_cnt, cols);
			for (j = 0; j < *components - 1; j++)
				KL_score_r += get_KL(sub_array, arr_c[dsItem(genes, j)], max_cnt, cols);
			KL_score_r = KL_score_r / (*components);

			/* adjust the column score for every candidate column on which
			 * the new row disagrees with the first seed row */
			for (j = 0; j < cols; j++)
			{
				if (colcand[j] && (arr_c[max_i][j] != arr_c[dsItem(genes, 0)][j]))
				{
					KL_score_c += (KL_score_c - KL_score[j]) / (col_num - 1);
					col_num--;
				}
			}
		}

		/* stop conditions */
		if (po->IS_cond)
		{
			if (max_cnt < po->COL_WIDTH || max_i < 0 || max_cnt < b->cond_low_bound)
				break;
		}
		else
		{
			if (max_cnt < po->COL_WIDTH || max_i < 0)
				break;
		}

		if (po->IS_area)
			score = *components * max_cnt;
		else
			score = floor(100 * (KL_score_r + KL_score_c));

		if (score > b->score)
		{
			b->score = score;
			b->significance = KL_score_r;
		}

		dsPush(genes, max_i);
		dsPush(scores, score);
		update_colcand(colcand, arr_c[dsItem(genes, 0)], arr_c[max_i]);
		candidates[max_i] = FALSE;
	}

	/* be sure to free a pointer when you finish using it */
	free(colcand);
}
/*
 * Core algorithm.
 *
 * Visits the first n seed edges of el in order.  Every seed that passes the
 * filter below is expanded into a candidate block; blocks that end up with at
 * least one column are collected (at most po->SCH_BLOCK of them) and then
 * passed to report_blocks().
 *
 * fw - stream handed through to report_blocks()
 * el - array of pointers to seed edges
 * n  - number of edges in el to visit
 *
 * Returns the value returned by report_blocks().
 *
 * The block array bb, the accepted blocks and the global profile are not
 * released here.
 * NOTE(review): confirm whether report_blocks() (or its caller) owns them.
 */
int cluster (FILE *fw, Edge **el, int n)
{
	int block_id = 0;
	Block **bb;
	int allocated = po->SCH_BLOCK;
	AllocArray(bb, allocated);

	Edge *e;
	Block *b;
	struct dyStack *genes, *scores, *b_genes, *allincluster;
	int i, j, k, components;

	AllocArray(profile, cols);
	for (j = 0; j < cols; j++)
		AllocArray(profile[j], sigma);

	genes = dsNew(rows);
	scores = dsNew(rows);
	allincluster = dsNew(rows);

	bool *candidates;
	AllocArray(candidates, rows);

	for (i = 0; i < n; i++)
	{
		e = *el++;

		/* check if both genes already enumerated in previous blocks */
		bool flag = TRUE;
		/* speed up the program if there are more than 200 rows: only test
		 * membership in allincluster instead of calling check_seed() */
		if (rows > 200)
		{
			if (isInStack(allincluster, e->gene_one) && isInStack(allincluster, e->gene_two))
				flag = FALSE;
		}
		else
		{
			flag = check_seed(e, bb, block_id);
		}
		if (!flag)
			continue;

		for (j = 0; j < cols; j++)
			for (k = 0; k < sigma; k++)
				profile[j][k] = 0;

		AllocVar(b);
		b->score = MIN(2, e->score);

		/* Fill both stacks with -1 and clear them again, so that the
		 * traceback below reads -1 from slots past the live entries. */
		int ii;
		dsClear(genes);
		dsClear(scores);
		for (ii = 0; ii < rows; ii++)
		{
			dsPush(genes, -1);
			dsPush(scores, -1);
		}
		dsClear(genes);
		dsClear(scores);

		dsPush(genes, e->gene_one);
		dsPush(genes, e->gene_two);
		dsPush(scores, 1);
		dsPush(scores, b->score);

		/* branch-and-cut condition for seed expansion */
		int cand_threshold = floor(po->COL_WIDTH * po->TOLERANCE);
		if (cand_threshold < 2)
			cand_threshold = 2;

		/* maintain a candidate list to avoid looping through all rows */
		for (j = 0; j < rows; j++)
			candidates[j] = TRUE;
		candidates[e->gene_one] = candidates[e->gene_two] = FALSE;
		components = 2;

		/* expansion step, generate a bicluster without noise */
		block_init(e, b, genes, scores, candidates, cand_threshold, &components, allincluster);

		/* Track back to the last position holding the best score.  The
		 * look-ahead is skipped at the final slot: index rows would lie one
		 * past the rows entries requested from dsNew(). */
		for (k = 0; k < components; k++)
		{
			if (dsItem(scores, k) != b->score)
				continue;
			if (k + 1 >= rows || dsItem(scores, k + 1) != b->score)
				break;
		}
		components = k + 1;

		int ki;
		for (ki = 0; ki < rows; ki++)
			candidates[ki] = TRUE;

		for (ki = 0; ki < components - 1; ki++)
		{
			seed_update(arr_c[dsItem(genes, ki)]);
			candidates[dsItem(genes, ki)] = FALSE;
		}
		candidates[dsItem(genes, k)] = FALSE;
		/* cut the gene stack back to the first k+1 entries */
		genes->top = k;

		int cnt = 0;
		bool *colcand;
		AllocArray(colcand, cols);
		for (ki = 0; ki < cols; ki++)
			colcand[ki] = FALSE;

		/* add columns satisfy the conservative r */
		seed_current_modify(arr_c[dsItem(genes, k)], colcand, &cnt, components);

		/* cnt does not change in the two loops below, so compute the
		 * required number of matching columns once */
		const double min_match = floor(cnt * po->TOLERANCE);

		/* add some new possible genes */
		int m_cnt;
		for (ki = 0; ki < rows; ki++)
		{
			m_cnt = intersect_row(colcand, arr_c[dsItem(genes, 0)], arr_c[ki]);
			if (candidates[ki] && (m_cnt >= min_match))
			{
				dsPush(genes, ki);
				components++;
				candidates[ki] = FALSE;
			}
		}
		/* row count before the reverse-matching rows are appended */
		b->block_rows_pre = components;

		/* add genes that negative regulated to the consensus */
		for (ki = 0; ki < rows; ki++)
		{
			m_cnt = reverse_row(colcand, arr_c[dsItem(genes, 0)], arr_c[ki]);
			if (candidates[ki] && (m_cnt >= min_match))
			{
				dsPush(genes, ki);
				components++;
				candidates[ki] = FALSE;
			}
		}
		free(colcand);

		/* copy of the first block_rows_pre genes, used only by scan_block */
		b_genes = dsNew(b->block_rows_pre);
		for (ki = 0; ki < b->block_rows_pre; ki++)
			dsPush(b_genes, dsItem(genes, ki));

		/* store gene arrays inside block */
		b->genes = dsNew(components);
		b->conds = dsNew(cols);

		scan_block(b_genes, b);
		/* scan_block only reads b_genes, so it can be released right away */
		free(b_genes);

		if (b->block_cols == 0)
		{
			/* rejected block: release it instead of leaking it */
			free(b->genes);
			free(b->conds);
			free(b);
			continue;
		}

		b->block_rows = components;
		/* final score is the block area */
		b->score = b->block_rows * b->block_cols;

		dsClear(b->genes);
		for (ki = 0; ki < components; ki++)
			dsPush(b->genes, dsItem(genes, ki));

		/* remember every gene that has appeared in an accepted block */
		for (ki = 0; ki < components; ki++)
			if (!isInStack(allincluster, dsItem(genes, ki)))
				dsPush(allincluster, dsItem(genes, ki));

		bb[block_id++] = b;

		/* reaching the results number limit */
		if (block_id == po->SCH_BLOCK)
			break;
		verboseDot();
	}

	putchar('\n');

	/* free-up the candidate list and the working stacks */
	free(candidates);
	free(allincluster);
	free(genes);
	free(scores);

	return report_blocks(fw, bb, block_id);
}
/*
 * Greedy expansion of a two-gene seed.
 *
 * On entry genes holds the two seed rows and scores the two initial scores.
 * Rows are appended one at a time -- always the remaining candidate sharing
 * the most candidate columns with the first seed row -- until the best
 * candidate matches fewer than po->COL_WIDTH columns or no candidate is left.
 *
 * e              - seed edge (not used in this function)
 * b              - block being built; b->score is raised whenever a step
 *                  scores higher
 * genes, scores  - stacks receiving each added row and the score of its step
 * candidates     - per-row flags; rows are switched off as they are pruned
 *                  or consumed
 * cand_threshold - candidates matching fewer columns than this are dropped
 * components     - in/out row counter; it is incremented before each attempt,
 *                  so on return it is one larger than the number of genes
 * allincluster   - not used in this function
 */
static void block_init(Edge *e, Block *b, struct dyStack *genes, struct dyStack *scores, bool *candidates, const int cand_threshold, int *components, struct dyStack *allincluster)
{
	int i, score, top;
	int cnt = 0;
	int max_cnt, max_i;

	int *arr_rows, *arr_rows_b;
	AllocArray(arr_rows, rows);
	AllocArray(arr_rows_b, rows);

	bool *colcand;
	AllocArray(colcand, cols);
	for (i = 0; i < cols; i++)
		colcand[i] = FALSE;

	discrete *g1, *g2;
	g1 = arr_c[dsItem(genes, 0)];
	g2 = arr_c[dsItem(genes, 1)];

	/* initial colcand: columns where both seed rows carry the same
	 * non-zero value */
	for (i = 0; i < cols; i++)
		if ((g1[i] == g2[i]) && (g1[i] != 0))
			colcand[i] = TRUE;

	for (i = 0; i < rows; i++)
	{
		arr_rows[i] = intersect_row(colcand, arr_c[dsItem(genes, 0)], arr_c[i]);
		arr_rows_b[i] = arr_rows[i];
	}

	/* with more than 100 rows, drop every candidate whose match count is
	 * below the value found at position rows-100 of the sorted counts */
	if (rows > 100)
	{
		qsort(arr_rows_b, rows, sizeof *arr_rows_b, compare_int);
		top = arr_rows_b[rows - 100];
		for (i = 0; i < rows; i++)
			if (arr_rows[i] < top)
				candidates[i] = FALSE;
	}

	/* neither array is needed past this point; previously both leaked on
	 * every call */
	free(arr_rows);
	free(arr_rows_b);

	while (*components < rows)
	{
		max_cnt = -1;
		max_i = -1;
		(*components)++;

		/* find the candidate sharing the most columns with the first seed
		 * row, pruning weak candidates on the way */
		for (i = 0; i < rows; i++)
		{
			if (!candidates[i])
				continue;
			cnt = intersect_row(colcand, arr_c[dsItem(genes, 0)], arr_c[i]);
			if (cnt < cand_threshold)
				candidates[i] = FALSE;
			if (cnt > max_cnt)
			{
				max_cnt = cnt;
				max_i = i;
			}
		}

		/* stop once the best candidate is too narrow or none is left */
		if (max_cnt < po->COL_WIDTH || max_i < 0)
			break;

		score = MIN(*components, max_cnt);
		if (score > b->score)
			b->score = score;

		dsPush(genes, max_i);
		dsPush(scores, score);
		update_colcand(colcand, arr_c[dsItem(genes, 0)], arr_c[max_i]);
		candidates[max_i] = FALSE;
	}

	free(colcand);
}
} /* Read the .block file, get components and colcand */ void read_and_solve_blocks(FILE *fb, const char *fn) { init_expand(); size_t n; int col; char *line = NULL; int bnumber = 0; struct dyStack *ge, *co; int i, components, m_cnt; bool *colcand; bool *candidates; Block *b; AllocVar(b); AllocArray(colcand, another_cols); AllocArray(candidates, another_rows); ge = dsNew(another_rows); co = dsNew(another_cols); FILE *fo = mustOpen(fn, "w"); /* main course starts here */ while (getline(&line, &n, fb) != -1) { /* fast forward to a line that contains BC*/ /* strncmp compares up to num characters of the C string str1 to those of the C string str2 * strncmp ( const char * str1, const char * str2, size_t num )*/ while (strncmp(line, "BC", 2)!=0) { if (getline(&line, &n, fb)==-1) exit(0); } components = 0; col = 0; dsClear(ge); dsClear(co); for (i=0; i< another_cols; i++) colcand[i] = FALSE; for (i=0; i< another_rows; i++) candidates[i] = TRUE; /* read genes from block */ getline(&line, &n, fb); atom = strtok(line, delims); atom = strtok(NULL, delims); while((atom = strtok(NULL, delims)) != NULL) { /* look up for genes number */ if (strlen(atom) == 0) continue; for(i=0; i<another_rows; i++) { if (strcmp(atom ,another_genes[i]) == 0) break; } candidates[i] = FALSE; dsPush(ge, i); components++; } /* read conditions from block */ getline(&line, &n, fb); atom = strtok(line, delims); atom = strtok(NULL, delims); while((atom = strtok(NULL, delims)) != NULL) { /*if (strlen(atom) < 5) break;*/ if (strlen(atom) == 0) continue; for(i=0; i<another_cols; i++) if (strcmp(atom, another_conds[i]) == 0) break; colcand[i] = TRUE; dsPush(co, i); col++; } b->block_rows_pre = components; /* add some possible genes */ for( i = 0; i < another_rows; i++) { m_cnt = intersect_row(colcand, another_arr_c[dsItem(ge,0)], another_arr_c[i], another_cols); printf ("%d\n",m_cnt); if( candidates[i] && (m_cnt >= (int)floor( (double)col * po->TOLERANCE)) ) { dsPush(ge,i); components++; candidates[i] = FALSE; 
} } /* add genes that negative regulated to the consensus */ for( i = 0; i < another_rows; i++) { m_cnt = reverse_row(colcand, another_arr_c[dsItem(ge,0)], another_arr_c[i], another_cols); if( candidates[i] && (m_cnt >= (int)floor( (double)col * po->TOLERANCE)) ) { dsPush(ge,i); components++; candidates[i] = FALSE; } } if(dsSize(ge) > 1) { store_block(b, ge, co); /*another_print_bc(fo, b, bnumber);*/ print_bc(fo, b, bnumber++); }