Ejemplo n.º 1
0
/* Rebuild the global symbol profile from the rows in gene_set, then record
 * in b_ptr->conds every column in which some non-zero symbol occurs at least
 * ceil(TOLERANCE * number_of_rows) times.  The "fuzziness" of the block is
 * controlled by TOLERANCE (-c).
 *
 * gene_set: stack of row indices into arr_c
 * b_ptr:    block whose conds stack is appended to; block_cols is set to the
 *           resulting size of that stack
 */
void scan_block (struct dyStack *gene_set, Block *b_ptr)
{
	const int n_genes = dsSize(gene_set);
	int col, sym, g;

	/* wipe the profile before counting */
	for (col = 0; col < cols; col++)
	{
		for (sym = 0; sym < sigma; sym++)
		{
			profile[col][sym] = 0;
		}
	}

	/* let every row of the set contribute to the profile */
	for (g = 0; g < n_genes; g++)
	{
		seed_update(arr_c[dsItem(gene_set, g)]);
	}

	int btolerance = ceil(po->TOLERANCE * n_genes);

	for (col = 0; col < cols; col++)
	{
		/* symbol 0 is skipped because symbols[0]=0; look for the first
		 * remaining symbol that reaches the tolerance in this column */
		sym = 1;
		while (sym < sigma && profile[col][sym] < btolerance)
			sym++;

		if (sym < sigma)
			dsPush(b_ptr->conds, col);
	}

	b_ptr->block_cols = dsSize(b_ptr->conds);
}
Ejemplo n.º 2
0
/* Core algorithm.
 *
 * Walks the first n seed edges of el in order.  Every seed that passes the
 * filters is expanded into a bicluster (block_init), trimmed back to the
 * prefix that produced the best score, extended with further rows that are
 * consistent (or reverse-consistent) with the consensus, and finally stored
 * in a local block list.  The list is handed to block_enrichment() and
 * report_blocks().
 *
 * fw: output stream, passed through to block_enrichment()/report_blocks()
 * el: array of seed edges
 * n:  number of edges to consume from el
 *
 * Returns the value returned by report_blocks().
 */
int cluster (FILE *fw, Edge **el, int n)
{
	int block_id = 0;
	Block **bb;
	int allocated = po->SCH_BLOCK;
	AllocArray(bb, allocated);

	Edge *e;
	Block *b;
	struct dyStack *genes, *scores, *b_genes, *allincluster;
	
	int i, j, k, components;
	AllocArray(profile, cols);
	for (j = 0; j < cols; j++) 
		AllocArray(profile[j], sigma);

	genes = dsNew(rows);
	scores = dsNew(rows);
	allincluster = dsNew(rows);

	bool *candidates;
	AllocArray(candidates, rows);

	i = 0;
	while (i++ < n)
	{	
		e = *el++;
		/* check if both genes already enumerated in previous blocks */
		bool flag = TRUE;
		/* speed up the program if there are more than 250 rows */
		if (rows > 250)
		{ 
			if ( isInStack(allincluster,e->gene_one) && isInStack(allincluster,e->gene_two) )
				flag = FALSE;
			else if ((po->IS_TFname)&&(e->gene_one!= TFindex)&&(e->gene_two!=TFindex))
				flag = FALSE;
			else if ((po->IS_list)&&(!sublist[e->gene_one] || !sublist[e->gene_two]))
				flag =FALSE;
		}
		else   
		{
			flag = check_seed(e, bb, block_id);
			if ((po->IS_TFname)&&(e->gene_one!= TFindex)&&(e->gene_two!=TFindex))
				flag = FALSE;
			if ((po->IS_list)&&(!sublist[e->gene_one] || !sublist[e->gene_two]))
				flag = FALSE;
		}
		if (!flag) continue;

		for (j = 0; j < cols; j++)
			for (k = 0; k < sigma; k++) 
				profile[j][k] = 0;

		/* the block must exist before any of its fields can be used */
		AllocVar(b);
		/* initial value of b->score */
		b->score = MIN(2, e->score);
	
		/* Initialize the stacks genes and scores.  Every slot is filled
		 * with -1 and the stacks are cleared again, so that slots beyond
		 * the current top hold -1; the trace-back loop below reads
		 * scores one slot past the entry it is testing. */
		int ii;		
		dsClear(genes);
		dsClear(scores);		
		for(ii = 0; ii < rows; ii ++)
		{
			dsPush(genes,-1);
			dsPush(scores,-1);
		}		
		dsClear(genes);
		dsClear(scores);
		
		dsPush(genes, e->gene_one);
		dsPush(genes, e->gene_two);
		dsPush(scores, 1);
		dsPush(scores, b->score);

		/* branch-and-cut condition for seed expansion */
		int cand_threshold = floor(po->COL_WIDTH * po->TOLERANCE);
		if (cand_threshold < 2) 
			cand_threshold = 2;

		/* maintain a candidate list to avoid looping through all rows */		
		for (j = 0; j < rows; j++) 
			candidates[j] = TRUE;
		candidates[e->gene_one] = candidates[e->gene_two] = FALSE;
		components = 2;

		/* expansion step, generate a bicluster without noise */
		block_init(e, b, genes, scores, candidates, cand_threshold, &components, allincluster);

		/* Track back to the last gene at which the best score was
		 * reached.
		 * NOTE(review): dsItem(scores,k+1) can address slot `rows` when
		 * the best score sits in the very last slot — confirm the stack
		 * capacity covers that read. */
		for(k = 0; k < components; k++)
		{
			if ((dsItem(scores,k) == b->score)&&(dsItem(scores,k+1)!= b->score)) break;
		}
		components = k + 1;

		int ki;
		for (ki=0; ki < rows; ki++)
			candidates[ki] = TRUE;

		/* rebuild the profile from all kept genes except the last one */
		for (ki=0; ki < components - 1 ; ki++)
		{
			seed_update(arr_c[dsItem(genes,ki)]);
			candidates[dsItem(genes,ki)] = FALSE;
		}
		candidates[dsItem(genes,k)] = FALSE;
		/* truncate the gene stack to the kept prefix */
		genes->top = k ;
		int cnt = 0;
		bool *colcand;
		AllocArray(colcand, cols);
		for(ki = 0; ki < cols; ki++) 
			colcand[ki] = FALSE;             
    
		/* add columns satisfy the conservative r */ 
		seed_current_modify(arr_c[dsItem(genes,k)], colcand, &cnt, components);
		
		/* Add some new possible genes.
		 * NOTE(review): sub_array is never released in this function; if
		 * get_intersect_row()/get_intersect_reverse_row() heap-allocate
		 * their result this leaks per candidate — confirm against their
		 * definitions. */
		int m_cnt=0;
		continuous KL_score=0;
		discrete *sub_array;
		for ( ki = 0; ki < rows; ki++)
		{
			if (po->IS_list && !sublist[ki]) continue;
			m_cnt = intersect_row(colcand, arr_c[dsItem(genes,0)], arr_c[ki]);
			if ( candidates[ki] && (m_cnt >= floor(cnt* po->TOLERANCE)) )
			{
				sub_array = get_intersect_row(colcand,arr_c[dsItem(genes,0)],arr_c[ki],m_cnt);
				KL_score = get_KL (sub_array, arr_c[ki], m_cnt, cols);
				if (KL_score>=b->significance * po->TOLERANCE)
				{
					dsPush(genes,ki);
					components++;
					candidates[ki] = FALSE;
				}
			}
		}

		/* number of rows before the reverse-consistent rows are added */
		b->block_rows_pre = components;
		
		/* add genes that negative regulated to the consensus */
		for ( ki = 0; ki < rows; ki++)
		{
			if (po->IS_list && !sublist[ki]) continue;
			m_cnt = reverse_row(colcand, arr_c[dsItem(genes,0)], arr_c[ki]);
			if ( candidates[ki] && (m_cnt >= floor(cnt * po->TOLERANCE)) )
			{
				sub_array = get_intersect_reverse_row(colcand,arr_c[dsItem(genes,0)],arr_c[ki],m_cnt);
				KL_score = get_KL (sub_array, arr_c[ki], m_cnt, cols);
				if (KL_score>=b->significance * po->TOLERANCE)
				{
					dsPush(genes,ki);
					components++;
					candidates[ki] = FALSE;
				}
			}
		}
		free(colcand);

		/* save the current cluster*/
		b_genes = dsNew(b->block_rows_pre);
		for (ki = 0; ki < b->block_rows_pre; ki++)
			dsPush(b_genes, dsItem(genes,ki));

		/* store gene arrays inside block */
		b->genes = dsNew(components);
		b->conds = dsNew(cols);
	
		scan_block(b_genes, b);
		/* b_genes only served as input to scan_block; release it here
		 * (stacks from dsNew are released with free(), as done for
		 * allincluster at the end of this function) */
		free(b_genes);
		if (b->block_cols == 0)
		{
			/* The block is rejected and never stored in bb, so nothing
			 * else references it: release it instead of leaking it.
			 * Assumes AllocVar memory is releasable with free(), like
			 * the AllocArray memory freed elsewhere in this function. */
			free(b->genes);
			free(b->conds);
			free(b);
			continue;
		}
		b->block_rows = components;

		/* b->score keeps the value found during expansion */

		dsClear(b->genes);
		for ( ki=0; ki < components; ki++)
			dsPush(b->genes,dsItem(genes,ki));
		for(ki = 0; ki < components; ki++)
			if(!isInStack(allincluster, dsItem(genes,ki))) 
				dsPush(allincluster,dsItem(genes,ki));	
		/* save the current block b to the block list bb so that the blocks can be sorted by their score */
		bb[block_id++] = b;

		/* reaching the results number limit */
		if (block_id == po->SCH_BLOCK) break;
		verboseDot();	
	}
	/* terminate the progress output line */
	putchar('\n');
	/* free-up the candidate list and the working stacks */
	free(candidates);
	free(allincluster);
	free(genes);
	free(scores);
	block_enrichment(fw, bb, block_id);
	return report_blocks(fw, bb, block_id);
}
Ejemplo n.º 3
0
/* Expansion step: greedily grow the seed held in genes (its first two
 * entries) one row at a time.
 *
 * In every round the candidate row sharing the most candidate columns with
 * the first gene is chosen.  The round's score is pushed onto scores, the row
 * onto genes, and b->score / b->significance are raised whenever the new
 * score beats the best one so far.  Expansion stops when the best candidate
 * no longer meets the column-width (and, with IS_cond, the low-bound)
 * requirement or when *components reaches rows.
 *
 * e:              unused in this function
 * b:              block being built; score, significance and cond_low_bound
 *                 are written
 * genes, scores:  stacks extended in step with each other
 * candidates:     per-row flags; rows are switched off as they are used or
 *                 fall below cand_threshold
 * cand_threshold: minimum shared-column count for a row to stay a candidate
 * components:     in/out row counter; it is incremented at the start of each
 *                 round, including the round that breaks out of the loop
 * allincluster:   unused in this function
 */
static void block_init(Edge *e, Block *b, 
                     struct dyStack *genes, struct dyStack *scores,
                     bool *candidates, const int cand_threshold,
                     int *components, struct dyStack *allincluster)
{
	int i,j=0,score,top;
	int cnt = 0, col_num=0;
	int max_cnt, max_i;
	int *arr_rows, *arr_rows_b;
	AllocArray(arr_rows, rows);
	AllocArray(arr_rows_b, rows);	
	bool *colcand;
	AllocArray(colcand, cols);
	for (i=0; i< cols; i++) 
		colcand[i] = FALSE;
	discrete *g1, *g2;
	discrete *sub_array, col_array[2], col_all[rows];
	continuous KL_score[cols],KL_score_c=0, KL_score_r=0;
	g1 = arr_c[dsItem(genes,0)];
	g2 = arr_c[dsItem(genes,1)];

	/* Initial colcand: columns where both seed genes carry the same symbol
	 * and that symbol maps to a non-zero entry of symbols[].  KL_score[i]
	 * is filled in for exactly these columns. */
	for (i=0; i< cols; i++)
	{
		if ((g1[i] == g2[i])&&(symbols[g1[i]]!=0)) 
		{
			colcand[i] = TRUE;
			for (j=0;j<rows;j++)
				col_all[j]=arr_c[j][i];
			col_array[0]=col_array[1]=symbols[g1[i]];
			KL_score[i] = get_KL (col_array, col_all, 2, rows);
			KL_score_c += KL_score[i];
			col_num++;
		}
	}
	/* mean column score over the selected columns */
	KL_score_c = KL_score_c/col_num;

	for (i = 0; i < rows; i++)
	{
		arr_rows[i] = intersect_row(colcand, arr_c[dsItem(genes,0)], arr_c[i]);
		arr_rows_b[i] = arr_rows[i];
	}
	/* Only the 100 rows with the largest overlap are kept as candidates when
	 * a bicluster is initialised: they are believed to characterise its
	 * structure, and this reduces the running time. */
	if (rows > 100)
	{		
		qsort(arr_rows_b, rows, sizeof *arr_rows, compare_int);
		top = arr_rows_b[rows -100];
		for (i = 0; i < rows; i++)
			if (arr_rows[i] < top) 
				candidates[i] = FALSE;
	}

	/* Calculate the condition low bound for current seed.
	 * With fewer than 20 rows floor(0.05*rows) is 0 and rows-cutoff would
	 * index one element past the end of arr_rows_b, so the offset is
	 * clamped to the last element.
	 * NOTE(review): arr_rows_b is only sorted when rows > 100; for smaller
	 * inputs this reads an unsorted value — confirm that is intended. */
	int cutoff = floor (0.05*rows);
	if (cutoff < 1)
		cutoff = 1;
	b->cond_low_bound = arr_rows_b[rows-cutoff];

	while (*components < rows)
	{
		max_cnt = -1;
		max_i = -1;
		(*components)++;
		/* find the candidate sharing the most candidate columns with the
		 * first gene; prune rows that fall below the threshold */
		for (i=0; i< rows; i++)
		{
			if (!candidates[i]) continue;
			if (po->IS_list && !sublist[i]) continue;
			cnt = intersect_row(colcand,arr_c[dsItem(genes,0)],arr_c[i]);
			if (cnt < cand_threshold) 
				candidates[i] = FALSE;
			if (cnt > max_cnt)
			{
				max_cnt = cnt;
				max_i = i;
			}
		}

		if (max_cnt>0)
		{
			/* Row score: get_KL of the shared sub-row against the chosen
			 * row and against every gene already in the block, divided by
			 * *components.
			 * NOTE(review): sub_array is never released here; if
			 * get_intersect_row() heap-allocates its result this leaks
			 * once per round — confirm against its definition. */
			sub_array = get_intersect_row(colcand,arr_c[dsItem(genes,0)],arr_c[max_i],max_cnt);
			KL_score_r = get_KL (sub_array, arr_c[max_i], max_cnt, cols);
			for (j=0;j<*components-1;j++)
				KL_score_r += get_KL (sub_array, arr_c[dsItem(genes,j)], max_cnt, cols);
			KL_score_r = KL_score_r/(*components);
			/* adjust the running column score for every candidate column
			 * on which the chosen row disagrees with the first gene */
			for (j=0;j<cols;j++)
			{
				if (colcand[j] && (arr_c[max_i][j] != arr_c[dsItem(genes,0)][j]))
				{
					KL_score_c += (KL_score_c-KL_score[j])/(col_num-1);
					col_num--;
				}
			}
		}

		if (po->IS_cond)
		{
			if (max_cnt < po->COL_WIDTH || max_i < 0 || max_cnt < b->cond_low_bound) break;
		}
		else
		{
			if (max_cnt < po->COL_WIDTH || max_i < 0) break;
		}

		if (po->IS_area)
			score = *components*max_cnt;
		else
			score = floor(100*(KL_score_r+KL_score_c));

		if (score > b->score) 
		{
			b->score = score;
			b->significance = KL_score_r;
		}
		dsPush(genes, max_i);
		dsPush(scores,score);
		update_colcand(colcand,arr_c[dsItem(genes,0)], arr_c[max_i]);
		candidates[max_i] = FALSE;
	}
	/* release every scratch buffer allocated above */
	free(arr_rows);
	free(arr_rows_b);
	free(colcand);
}
Ejemplo n.º 4
0
/* Core algorithm.
 *
 * Walks the first n seed edges of el in order.  Every seed that passes the
 * filter is expanded into a bicluster (block_init), trimmed back to the
 * prefix that produced the best score, extended with further rows that are
 * consistent (or reverse-consistent) with the consensus, and stored in a
 * local block list that is finally handed to report_blocks().
 *
 * fw: output stream, passed through to report_blocks()
 * el: array of seed edges
 * n:  number of edges to consume from el
 *
 * Returns the value returned by report_blocks().
 */
int cluster (FILE *fw, Edge **el, int n)
{
	int block_id = 0;
	Block **bb;
	int allocated = po->SCH_BLOCK;
	AllocArray(bb, allocated);

	Edge *e;
	Block *b;
	struct dyStack *genes, *scores, *b_genes, *allincluster;
	
	int i, j, k, components;

	AllocArray(profile, cols);
	for (j = 0; j < cols; j++) AllocArray(profile[j], sigma);

	genes = dsNew(rows);
	scores = dsNew(rows);
	allincluster = dsNew(rows);
    
	bool *candidates;
	AllocArray(candidates, rows);

	i = 0;
	while (i++ < n)
	{	
		e = *el++;

		/* check if both genes already enumerated in previous blocks */
		bool flag = TRUE;
		/* speed up the program if the rows bigger than 200 */
		if (rows > 200)
		{ 
			if ( isInStack(allincluster,e->gene_one) && isInStack(allincluster,e->gene_two) )
				flag = FALSE;
		}
		else   
		{
			flag = check_seed(e, bb, block_id);
		}
		if (!flag) continue;

		for (j = 0; j < cols; j++)
			for (k = 0; k < sigma; k++) profile[j][k] = 0;

		AllocVar(b);
		b->score = MIN(2, e->score);
	
		/* Initialize the stacks genes and scores.  Every slot is filled
		 * with -1 and the stacks are cleared again, so that slots beyond
		 * the current top hold -1; the trace-back loop below reads
		 * scores one slot past the entry it is testing. */
		int ii;		
		dsClear(genes);
		dsClear(scores);		
		for(ii = 0; ii < rows; ii ++)
		{
			dsPush(genes,-1);
			dsPush(scores,-1);
		}		
		dsClear(genes);
		dsClear(scores);
		
		dsPush(genes, e->gene_one);
		dsPush(genes, e->gene_two);
		dsPush(scores, 1);
		dsPush(scores, b->score);

		/* branch-and-cut condition for seed expansion */
		int cand_threshold = floor(po->COL_WIDTH * po->TOLERANCE);
		if (cand_threshold < 2) cand_threshold = 2;

		/* maintain a candidate list to avoid looping through all rows */		
		for (j = 0; j < rows; j++) candidates[j] = TRUE;
		candidates[e->gene_one] = candidates[e->gene_two] = FALSE;
		
		components = 2;

		/* expansion step, generate a bicluster without noise */
		block_init(e, b, genes, scores, candidates, cand_threshold, &components, allincluster);

		/* Track back to the last gene at which the best score was
		 * reached.
		 * NOTE(review): dsItem(scores,k+1) can address slot `rows` when
		 * the best score sits in the very last slot — confirm the stack
		 * capacity covers that read. */
		for(k = 0; k < components; k++)
			if ((dsItem(scores,k) == b->score)&&(dsItem(scores,k+1)!= b->score)) break;
		components = k + 1;

		int ki;
		for (ki=0; ki < rows; ki++)
			candidates[ki] = TRUE;

		/* rebuild the profile from all kept genes except the last one */
		for (ki=0; ki < components - 1 ; ki++)
		{
			seed_update(arr_c[dsItem(genes,ki)]);
			candidates[dsItem(genes,ki)] = FALSE;
		}
		candidates[dsItem(genes,k)] = FALSE;
		/* truncate the gene stack to the kept prefix */
		genes->top = k ;
		int cnt = 0;
		bool *colcand;
		AllocArray(colcand, cols);
		for(ki = 0; ki < cols; ki++) colcand[ki] = FALSE;             
    
		/* add columns satisfy the conservative r */ 
		seed_current_modify(arr_c[dsItem(genes,k)], colcand, &cnt, components);
		
		/* add some new possible genes */
		int m_cnt;
		for ( ki = 0; ki < rows; ki++)
		{
			m_cnt = intersect_row(colcand, arr_c[dsItem(genes,0)], arr_c[ki]);
			if ( candidates[ki] && (m_cnt >= floor(cnt* po->TOLERANCE)) )
			{
				dsPush(genes,ki);
				components++;
				candidates[ki] = FALSE;
			}
		}
		/* number of rows before the reverse-consistent rows are added */
		b->block_rows_pre = components;
		
		/* add genes that negative regulated to the consensus */
		for ( ki = 0; ki < rows; ki++)
		{
			m_cnt = reverse_row(colcand, arr_c[dsItem(genes,0)], arr_c[ki]);
			if ( candidates[ki] && (m_cnt >= floor(cnt * po->TOLERANCE)) )
			{
				dsPush(genes,ki);
				components++;
				candidates[ki] = FALSE;
			}
		}
		free(colcand);

		/* save the current cluster*/
		b_genes = dsNew(b->block_rows_pre);
		for (ki = 0; ki < b->block_rows_pre; ki++)
			dsPush(b_genes, dsItem(genes,ki));

		/* store gene arrays inside block */
		b->genes = dsNew(components);
		b->conds = dsNew(cols);
	
		scan_block(b_genes, b);
		/* b_genes only served as input to scan_block; release it here
		 * (stacks from dsNew are released with free(), as done for
		 * allincluster at the end of this function) */
		free(b_genes);
		if (b->block_cols == 0)
		{
			/* The block is rejected and never stored in bb, so nothing
			 * else references it: release it instead of leaking it.
			 * Assumes AllocVar memory is releasable with free(), like
			 * the AllocArray memory freed elsewhere in this function. */
			free(b->genes);
			free(b->conds);
			free(b);
			continue;
		}
		b->block_rows = components;
		/* the final score of a block is its area */
		b->score = b->block_rows * b->block_cols;		
		dsClear(b->genes);
		for ( ki=0; ki < components; ki++)
			dsPush(b->genes,dsItem(genes,ki));
		for(ki = 0; ki < components; ki++)
			if(!isInStack(allincluster, dsItem(genes,ki))) dsPush(allincluster,dsItem(genes,ki));	

		bb[block_id++] = b;

		/* reaching the results number limit */
		if (block_id == po->SCH_BLOCK) break;
		verboseDot();	

	}

	/* terminate the progress output line */
	putchar('\n');
	/* free-up the candidate list and the working stacks */
	free(candidates);
	free(allincluster);
	free(genes);
	free(scores);

	return report_blocks(fw, bb, block_id);
}
Ejemplo n.º 5
0
/* Expansion step: greedily grow the seed held in genes (its first two
 * entries) one row at a time.
 *
 * In every round the candidate row sharing the most candidate columns with
 * the first gene is appended to genes, the round's score
 * MIN(*components, shared column count) is appended to scores, and b->score
 * is raised whenever that score beats the best one so far.  Expansion stops
 * when the best candidate shares fewer than COL_WIDTH columns, when no
 * candidate is left, or when *components reaches rows.
 *
 * e:              unused in this function
 * b:              block being built; only score is written
 * genes, scores:  stacks extended in step with each other
 * candidates:     per-row flags; rows are switched off as they are used or
 *                 fall below cand_threshold
 * cand_threshold: minimum shared-column count for a row to stay a candidate
 * components:     in/out row counter; it is incremented at the start of each
 *                 round, including the round that breaks out of the loop
 * allincluster:   unused in this function
 */
static void block_init(Edge *e, Block *b, 
                     struct dyStack *genes, struct dyStack *scores,
                     bool *candidates, const int cand_threshold,
                     int *components, struct dyStack *allincluster)
{
	int i,score,top;
	int cnt = 0;
	int max_cnt, max_i;
	int *arr_rows, *arr_rows_b;
	AllocArray(arr_rows, rows);
	AllocArray(arr_rows_b, rows);	
	bool *colcand;
	AllocArray(colcand, cols);
	for (i=0; i< cols; i++) colcand[i] = FALSE;
	discrete *g1, *g2;
	g1 = arr_c[dsItem(genes,0)];
	g2 = arr_c[dsItem(genes,1)];
	/* initial candidate columns: both seed genes agree on a non-zero value */
	for (i=0; i< cols; i++)
		if ((g1[i] == g2[i])&&(g1[i]!=0)) colcand[i] = TRUE;

	for (i = 0; i < rows; i++)
	{
		arr_rows[i] = intersect_row(colcand, arr_c[dsItem(genes,0)], arr_c[i]);
		arr_rows_b[i] = arr_rows[i];
	}
	/* with more than 100 rows, keep only rows whose overlap reaches the
	 * 100th-largest overlap value */
	if (rows > 100)
	{		
		qsort(arr_rows_b, rows, sizeof *arr_rows, compare_int);
		top = arr_rows_b[rows -100];
		for (i = 0; i < rows; i++)
			if (arr_rows[i] < top) candidates[i] = FALSE;
	}
	/* the overlap tables are not needed past this point; the original
	 * never released them */
	free(arr_rows);
	free(arr_rows_b);

	while (*components < rows)
	{
		max_cnt = -1;
		max_i = -1;
		(*components)++;
		/* pick the candidate with the largest overlap and prune rows that
		 * fall below the threshold */
		for (i=0; i< rows; i++)
		{
			if (!candidates[i]) continue;

			cnt = intersect_row(colcand,arr_c[dsItem(genes,0)],arr_c[i]);
			if (cnt < cand_threshold) candidates[i] = FALSE;
			if (cnt > max_cnt)
			{
				max_cnt = cnt;
				max_i = i;
			}
		}
		if (max_cnt < po->COL_WIDTH || max_i < 0) break;

		score = MIN(*components, max_cnt);
		if (score > b->score) b->score = score;
		dsPush(genes, max_i);
		dsPush(scores,score);
		update_colcand(colcand,arr_c[dsItem(genes,0)], arr_c[max_i]);
		candidates[max_i] = FALSE;
	}
	free(colcand);
}
Ejemplo n.º 6
0
}
/* Read the .block file, get components and colcand */
void read_and_solve_blocks(FILE *fb, const char *fn)
{
	init_expand();
	size_t n;
	int col;
	char *line = NULL;
	int bnumber = 0;
	struct dyStack *ge, *co;
	int i, components, m_cnt;
	bool *colcand;
	bool *candidates;
	Block *b;
	AllocVar(b);
	AllocArray(colcand, another_cols);
	AllocArray(candidates, another_rows);
	ge = dsNew(another_rows);
	co = dsNew(another_cols);
	FILE *fo = mustOpen(fn, "w");

	/* main course starts here */
	while (getline(&line, &n, fb) != -1)
	{
	        /* fast forward to a line that contains BC*/
		/* strncmp compares up to num characters of the C string str1 to those of the C string str2
		 * strncmp ( const char * str1, const char * str2, size_t num )*/
		while (strncmp(line, "BC", 2)!=0) 
		{
			if (getline(&line, &n, fb)==-1) 
				exit(0);
		}
		components = 0;
		col = 0;
		dsClear(ge);
		dsClear(co);
		for (i=0; i< another_cols; i++)
			colcand[i] = FALSE;
		for (i=0; i< another_rows; i++)
			candidates[i] = TRUE;
		/* read genes from block */		
		getline(&line, &n, fb);
		atom = strtok(line, delims);
		atom = strtok(NULL, delims);
		while((atom = strtok(NULL, delims)) != NULL)
		{
			/* look up for genes number */
			if (strlen(atom) == 0) continue;			
			for(i=0; i<another_rows; i++)
			{
				if (strcmp(atom ,another_genes[i]) == 0) break;
			}
			candidates[i] = FALSE;			
			dsPush(ge, i);
			components++;
		}
		/* read conditions from block */
		getline(&line, &n, fb);
		atom = strtok(line, delims);
		atom = strtok(NULL, delims);
		while((atom = strtok(NULL, delims)) != NULL)
		{
			/*if (strlen(atom) < 5) break;*/			
			if (strlen(atom) == 0) continue;			
			for(i=0; i<another_cols; i++)
				if (strcmp(atom, another_conds[i]) == 0) break;
			colcand[i] = TRUE;
			dsPush(co, i);
			col++;
		}
		
		b->block_rows_pre = components;
		/* add some possible genes */
		for( i = 0; i < another_rows; i++)
		{
			m_cnt = intersect_row(colcand, another_arr_c[dsItem(ge,0)], another_arr_c[i], another_cols);
			printf ("%d\n",m_cnt);
			if( candidates[i] && (m_cnt >= (int)floor( (double)col * po->TOLERANCE)) )
			{
				dsPush(ge,i);
				components++;
				candidates[i] = FALSE;
			}
		}
		/* add genes that negative regulated to the consensus */
		for( i = 0; i < another_rows; i++)
		{
			m_cnt = reverse_row(colcand, another_arr_c[dsItem(ge,0)], another_arr_c[i], another_cols);
			if( candidates[i] && (m_cnt >= (int)floor( (double)col * po->TOLERANCE)) )
			{
				dsPush(ge,i);
				components++;
				candidates[i] = FALSE;
			}
		}
		if(dsSize(ge) > 1)
		{		
			store_block(b, ge, co);
			/*another_print_bc(fo, b, bnumber);*/
			print_bc(fo, b, bnumber++);
		}