/*
 * Core algorithm: build blocks from a list of seed edges.
 *
 * Walks the first n entries of el in order.  Every edge that passes the seed
 * check starts a new Block, which is expanded by block_init(), cut back to
 * the first position whose score equals b->score while the next one does
 * not, and then extended with further rows (first through intersect_row(),
 * then through reverse_row()).  A block for which scan_block() leaves
 * block_cols at 0 is discarded.  The search stops as soon as po->SCH_BLOCK
 * blocks have been stored.
 *
 * fw - output stream, handed on unchanged to report_blocks()
 * el - array holding at least n edge pointers
 * n  - number of edges to examine
 *
 * Returns the value of report_blocks() for the collected blocks.
 *
 * Side effects visible in this function: allocates the externally declared
 * `profile` matrix, calls verboseDot() after each stored block (except the
 * one that hits the limit) and writes one newline to stdout.
 */
int cluster (FILE *fw, Edge **el, int n)
{
	int block_id = 0;
	Block **bb;
	int allocated = po->SCH_BLOCK;
	AllocArray(bb, allocated);

	Edge *e;
	Block *b;
	struct dyStack *genes, *scores, *b_genes, *allincluster;
	int i, j, k, components;

	AllocArray(profile, cols);
	for (j = 0; j < cols; j++)
		AllocArray(profile[j], sigma);

	genes = dsNew(rows);
	scores = dsNew(rows);
	allincluster = dsNew(rows);

	bool *candidates;
	AllocArray(candidates, rows);

	/* el is only dereferenced inside the loop, so n == 0 touches nothing */
	for (i = 0; i < n; i++)
	{
		e = el[i];

		/* check if both genes already enumerated in previous blocks */
		bool flag = TRUE;
		if (rows > 200)
		{
			/* per the original note, a speed-up for inputs with more than
			 * 200 rows: only look the two genes up in allincluster */
			if (isInStack(allincluster, e->gene_one)
			    && isInStack(allincluster, e->gene_two))
				flag = FALSE;
		}
		else
		{
			flag = check_seed(e, bb, block_id);
		}
		if (!flag)
			continue;

		for (j = 0; j < cols; j++)
			for (k = 0; k < sigma; k++)
				profile[j][k] = 0;

		AllocVar(b);
		b->score = MIN(2, e->score);

		/* initialize the stacks genes and scores: every slot is first
		 * written with -1 and the stacks are then emptied again, so a
		 * read past the live top (see the k+1 look-ahead below) finds -1;
		 * presumably that is the purpose - confirm against dsPush/dsItem */
		int ii;
		dsClear(genes);
		dsClear(scores);
		for (ii = 0; ii < rows; ii++)
		{
			dsPush(genes, -1);
			dsPush(scores, -1);
		}
		dsClear(genes);
		dsClear(scores);

		dsPush(genes, e->gene_one);
		dsPush(genes, e->gene_two);
		dsPush(scores, 1);
		dsPush(scores, b->score);

		/* branch-and-cut condition for seed expansion */
		int cand_threshold = floor(po->COL_WIDTH * po->TOLERANCE);
		if (cand_threshold < 2)
			cand_threshold = 2;

		/* maintain a candidate list to avoid looping through all rows */
		for (j = 0; j < rows; j++)
			candidates[j] = TRUE;
		candidates[e->gene_one] = candidates[e->gene_two] = FALSE;
		components = 2;

		/* expansion step, generate a bicluster without noise */
		block_init(e, b, genes, scores, candidates, cand_threshold,
		           &components, allincluster);

		/* track back to the first position k whose score equals b->score
		 * while the following score differs.
		 * NOTE(review): if no position matches, k ends at the old value of
		 * components and the code below indexes genes at k - presumably
		 * block_init() guarantees a match; confirm. */
		for (k = 0; k < components; k++)
			if ((dsItem(scores, k) == b->score)
			    && (dsItem(scores, k + 1) != b->score))
				break;
		components = k + 1;

		/* rebuild the candidate list and the profile for the kept genes */
		int ki;
		for (ki = 0; ki < rows; ki++)
			candidates[ki] = TRUE;
		for (ki = 0; ki < components - 1; ki++)
		{
			seed_update(arr_c[dsItem(genes, ki)]);
			candidates[dsItem(genes, ki)] = FALSE;
		}
		candidates[dsItem(genes, k)] = FALSE;
		genes->top = k;

		int cnt = 0;
		bool *colcand;
		AllocArray(colcand, cols);
		for (ki = 0; ki < cols; ki++)
			colcand[ki] = FALSE;

		/* add columns satisfy the conservative r */
		seed_current_modify(arr_c[dsItem(genes, k)], colcand, &cnt, components);

		/* cnt is not modified by the two loops below, so the acceptance
		 * threshold is computed once instead of once per row */
		double min_match = floor(cnt * po->TOLERANCE);

		/* add some new possible genes */
		int m_cnt;
		for (ki = 0; ki < rows; ki++)
		{
			m_cnt = intersect_row(colcand, arr_c[dsItem(genes, 0)], arr_c[ki]);
			if (candidates[ki] && (m_cnt >= min_match))
			{
				dsPush(genes, ki);
				components++;
				candidates[ki] = FALSE;
			}
		}
		b->block_rows_pre = components;

		/* add genes that negative regulated to the consensus */
		for (ki = 0; ki < rows; ki++)
		{
			m_cnt = reverse_row(colcand, arr_c[dsItem(genes, 0)], arr_c[ki]);
			if (candidates[ki] && (m_cnt >= min_match))
			{
				dsPush(genes, ki);
				components++;
				candidates[ki] = FALSE;
			}
		}
		free(colcand);

		/* save the current cluster: the first block_rows_pre genes only */
		b_genes = dsNew(b->block_rows_pre);
		for (ki = 0; ki < b->block_rows_pre; ki++)
			dsPush(b_genes, dsItem(genes, ki));

		/* store gene arrays inside block */
		b->genes = dsNew(components);
		b->conds = dsNew(cols);
		scan_block(b_genes, b);

		/* b_genes is a scratch copy used only for the call above; it was
		 * previously leaked on every accepted seed.
		 * NOTE(review): assumes scan_block() does not keep the pointer. */
		free(b_genes);

		if (b->block_cols == 0)
		{
			/* the block is dropped and was never stored in bb, so release
			 * it here instead of leaking it */
			free(b->conds);
			free(b->genes);
			free(b);
			continue;
		}

		b->block_rows = components;
		b->score = b->block_rows * b->block_cols;

		/* copy the final gene list into the block and remember every gene
		 * that now belongs to some block */
		dsClear(b->genes);
		for (ki = 0; ki < components; ki++)
		{
			dsPush(b->genes, dsItem(genes, ki));
			if (!isInStack(allincluster, dsItem(genes, ki)))
				dsPush(allincluster, dsItem(genes, ki));
		}

		bb[block_id++] = b;

		/* reaching the results number limit */
		if (block_id == po->SCH_BLOCK)
			break;
		verboseDot();
	}
	putchar('\n');

	/* free-up the candidate list and the work stacks */
	free(candidates);
	free(allincluster);
	free(scores);
	free(genes);

	return report_blocks(fw, bb, block_id);
}
/*
 * Core algorithm: build blocks from a list of seed edges.
 *
 * Walks the first n entries of el in order.  An edge is accepted as a seed
 * when it passes the "already covered" check and, if enabled, the
 * po->IS_TFname and po->IS_list filters.  Each accepted edge starts a new
 * Block, which is expanded by block_init(), cut back to the first position
 * whose score equals b->score while the next one does not, and then extended
 * with further rows.  A row is added when its match count (intersect_row()
 * in the first pass, reverse_row() in the second) reaches the threshold and
 * its get_KL() score is at least b->significance * po->TOLERANCE.  A block
 * for which scan_block() leaves block_cols at 0 is discarded.  The search
 * stops as soon as po->SCH_BLOCK blocks have been stored.
 *
 * fw - output stream, handed on to block_enrichment() and report_blocks()
 * el - array holding at least n edge pointers
 * n  - number of edges to examine
 *
 * Returns the value of report_blocks() for the collected blocks.
 *
 * Side effects visible in this function: allocates the externally declared
 * `profile` matrix, calls verboseDot() after each stored block (except the
 * one that hits the limit), writes one newline to stdout and calls
 * block_enrichment() before reporting.
 */
int cluster (FILE *fw, Edge **el, int n)
{
	int block_id = 0;
	Block **bb;
	int allocated = po->SCH_BLOCK;
	AllocArray(bb, allocated);

	Edge *e;
	Block *b;
	struct dyStack *genes, *scores, *b_genes, *allincluster;
	int i, j, k, components;

	AllocArray(profile, cols);
	for (j = 0; j < cols; j++)
		AllocArray(profile[j], sigma);

	genes = dsNew(rows);
	scores = dsNew(rows);
	allincluster = dsNew(rows);

	bool *candidates;
	AllocArray(candidates, rows);

	/* el is only dereferenced inside the loop, so n == 0 touches nothing */
	for (i = 0; i < n; i++)
	{
		e = el[i];

		/* check if both genes already enumerated in previous blocks;
		 * per the original note, inputs with more than 250 rows use the
		 * allincluster lookup as a speed-up instead of check_seed() */
		bool flag = TRUE;
		if (rows > 250)
		{
			if (isInStack(allincluster, e->gene_one)
			    && isInStack(allincluster, e->gene_two))
				flag = FALSE;
		}
		else
		{
			flag = check_seed(e, bb, block_id);
		}

		/* optional filters, applied the same way for both input sizes:
		 * with IS_TFname one of the two genes must be TFindex, with
		 * IS_list both genes must be flagged in sublist */
		if ((po->IS_TFname) && (e->gene_one != TFindex) && (e->gene_two != TFindex))
			flag = FALSE;
		if ((po->IS_list) && (!sublist[e->gene_one] || !sublist[e->gene_two]))
			flag = FALSE;

		if (!flag)
			continue;

		for (j = 0; j < cols; j++)
			for (k = 0; k < sigma; k++)
				profile[j][k] = 0;

		/* the block must exist before block_init() can fill it in */
		AllocVar(b);
		/* initial value of b->score */
		b->score = MIN(2, e->score);

		/* initialize the stacks genes and scores: every slot is first
		 * written with -1 and the stacks are then emptied again, so a
		 * read past the live top (see the k+1 look-ahead below) finds -1;
		 * presumably that is the purpose - confirm against dsPush/dsItem */
		int ii;
		dsClear(genes);
		dsClear(scores);
		for (ii = 0; ii < rows; ii++)
		{
			dsPush(genes, -1);
			dsPush(scores, -1);
		}
		dsClear(genes);
		dsClear(scores);

		dsPush(genes, e->gene_one);
		dsPush(genes, e->gene_two);
		dsPush(scores, 1);
		dsPush(scores, b->score);

		/* branch-and-cut condition for seed expansion */
		int cand_threshold = floor(po->COL_WIDTH * po->TOLERANCE);
		if (cand_threshold < 2)
			cand_threshold = 2;

		/* maintain a candidate list to avoid looping through all rows */
		for (j = 0; j < rows; j++)
			candidates[j] = TRUE;
		candidates[e->gene_one] = candidates[e->gene_two] = FALSE;
		components = 2;

		/* expansion step, generate a bicluster without noise */
		block_init(e, b, genes, scores, candidates, cand_threshold,
		           &components, allincluster);

		/* track back to the first position k whose score equals b->score
		 * while the following score differs.
		 * NOTE(review): if no position matches, k ends at the old value of
		 * components and the code below indexes genes at k - presumably
		 * block_init() guarantees a match; confirm. */
		for (k = 0; k < components; k++)
		{
			if ((dsItem(scores, k) == b->score)
			    && (dsItem(scores, k + 1) != b->score))
				break;
		}
		components = k + 1;

		/* rebuild the candidate list and the profile for the kept genes */
		int ki;
		for (ki = 0; ki < rows; ki++)
			candidates[ki] = TRUE;
		for (ki = 0; ki < components - 1; ki++)
		{
			seed_update(arr_c[dsItem(genes, ki)]);
			candidates[dsItem(genes, ki)] = FALSE;
		}
		candidates[dsItem(genes, k)] = FALSE;
		genes->top = k;

		int cnt = 0;
		bool *colcand;
		AllocArray(colcand, cols);
		for (ki = 0; ki < cols; ki++)
			colcand[ki] = FALSE;

		/* add columns satisfy the conservative r */
		seed_current_modify(arr_c[dsItem(genes, k)], colcand, &cnt, components);

		/* cnt is not modified by the two loops below, so the match-count
		 * threshold is computed once instead of once per row */
		double min_match = floor(cnt * po->TOLERANCE);

		/* add some new possible genes */
		int m_cnt = 0;
		continuous KL_score = 0;
		discrete *sub_array;
		for (ki = 0; ki < rows; ki++)
		{
			if (po->IS_list && !sublist[ki])
				continue;
			m_cnt = intersect_row(colcand, arr_c[dsItem(genes, 0)], arr_c[ki]);
			if (candidates[ki] && (m_cnt >= min_match))
			{
				/* NOTE(review): ownership of sub_array is not visible
				 * here; if get_intersect_row() allocates it, it is
				 * leaked on every call - confirm and free if so */
				sub_array = get_intersect_row(colcand, arr_c[dsItem(genes, 0)],
				                              arr_c[ki], m_cnt);
				KL_score = get_KL(sub_array, arr_c[ki], m_cnt, cols);
				if (KL_score >= b->significance * po->TOLERANCE)
				{
					dsPush(genes, ki);
					components++;
					candidates[ki] = FALSE;
				}
			}
		}
		b->block_rows_pre = components;

		/* add genes that negative regulated to the consensus */
		for (ki = 0; ki < rows; ki++)
		{
			if (po->IS_list && !sublist[ki])
				continue;
			m_cnt = reverse_row(colcand, arr_c[dsItem(genes, 0)], arr_c[ki]);
			if (candidates[ki] && (m_cnt >= min_match))
			{
				/* NOTE(review): same ownership question as above */
				sub_array = get_intersect_reverse_row(colcand, arr_c[dsItem(genes, 0)],
				                                      arr_c[ki], m_cnt);
				KL_score = get_KL(sub_array, arr_c[ki], m_cnt, cols);
				if (KL_score >= b->significance * po->TOLERANCE)
				{
					dsPush(genes, ki);
					components++;
					candidates[ki] = FALSE;
				}
			}
		}
		free(colcand);

		/* save the current cluster: the first block_rows_pre genes only */
		b_genes = dsNew(b->block_rows_pre);
		for (ki = 0; ki < b->block_rows_pre; ki++)
			dsPush(b_genes, dsItem(genes, ki));

		/* store gene arrays inside block */
		b->genes = dsNew(components);
		b->conds = dsNew(cols);
		scan_block(b_genes, b);

		/* b_genes is a scratch copy used only for the call above; it was
		 * previously leaked on every accepted seed.
		 * NOTE(review): assumes scan_block() does not keep the pointer. */
		free(b_genes);

		if (b->block_cols == 0)
		{
			/* the block is dropped and was never stored in bb, so release
			 * it here instead of leaking it */
			free(b->conds);
			free(b->genes);
			free(b);
			continue;
		}

		/* b->score is left as it stands at this point; the
		 * block_rows * block_cols scoring is disabled in this version */
		b->block_rows = components;

		/* copy the final gene list into the block and remember every gene
		 * that now belongs to some block */
		dsClear(b->genes);
		for (ki = 0; ki < components; ki++)
		{
			dsPush(b->genes, dsItem(genes, ki));
			if (!isInStack(allincluster, dsItem(genes, ki)))
				dsPush(allincluster, dsItem(genes, ki));
		}

		/* save the current block b to the block list bb so that the blocks
		 * can be sorted by their score later */
		bb[block_id++] = b;

		/* reaching the results number limit */
		if (block_id == po->SCH_BLOCK)
			break;
		verboseDot();
	}
	/* terminate the line on stdout */
	putchar('\n');

	/* free-up the candidate list and the work stacks */
	free(candidates);
	free(allincluster);
	free(scores);
	free(genes);

	block_enrichment(fw, bb, block_id);
	return report_blocks(fw, bb, block_id);
}
} /* Read the .block file, get components and colcand */ void read_and_solve_blocks(FILE *fb, const char *fn) { init_expand(); size_t n; int col; char *line = NULL; int bnumber = 0; struct dyStack *ge, *co; int i, components, m_cnt; bool *colcand; bool *candidates; Block *b; AllocVar(b); AllocArray(colcand, another_cols); AllocArray(candidates, another_rows); ge = dsNew(another_rows); co = dsNew(another_cols); FILE *fo = mustOpen(fn, "w"); /* main course starts here */ while (getline(&line, &n, fb) != -1) { /* fast forward to a line that contains BC*/ /* strncmp compares up to num characters of the C string str1 to those of the C string str2 * strncmp ( const char * str1, const char * str2, size_t num )*/ while (strncmp(line, "BC", 2)!=0) { if (getline(&line, &n, fb)==-1) exit(0); } components = 0; col = 0; dsClear(ge); dsClear(co); for (i=0; i< another_cols; i++) colcand[i] = FALSE; for (i=0; i< another_rows; i++) candidates[i] = TRUE; /* read genes from block */ getline(&line, &n, fb); atom = strtok(line, delims); atom = strtok(NULL, delims); while((atom = strtok(NULL, delims)) != NULL) { /* look up for genes number */ if (strlen(atom) == 0) continue; for(i=0; i<another_rows; i++) { if (strcmp(atom ,another_genes[i]) == 0) break; } candidates[i] = FALSE; dsPush(ge, i); components++; } /* read conditions from block */ getline(&line, &n, fb); atom = strtok(line, delims); atom = strtok(NULL, delims); while((atom = strtok(NULL, delims)) != NULL) { /*if (strlen(atom) < 5) break;*/ if (strlen(atom) == 0) continue; for(i=0; i<another_cols; i++) if (strcmp(atom, another_conds[i]) == 0) break; colcand[i] = TRUE; dsPush(co, i); col++; } b->block_rows_pre = components; /* add some possible genes */ for( i = 0; i < another_rows; i++) { m_cnt = intersect_row(colcand, another_arr_c[dsItem(ge,0)], another_arr_c[i], another_cols); printf ("%d\n",m_cnt); if( candidates[i] && (m_cnt >= (int)floor( (double)col * po->TOLERANCE)) ) { dsPush(ge,i); components++; candidates[i] = FALSE; 
} } /* add genes that negative regulated to the consensus */ for( i = 0; i < another_rows; i++) { m_cnt = reverse_row(colcand, another_arr_c[dsItem(ge,0)], another_arr_c[i], another_cols); if( candidates[i] && (m_cnt >= (int)floor( (double)col * po->TOLERANCE)) ) { dsPush(ge,i); components++; candidates[i] = FALSE; } } if(dsSize(ge) > 1) { store_block(b, ge, co); /*another_print_bc(fo, b, bnumber);*/ print_bc(fo, b, bnumber++); }