Пример #1
0
/*************************************************
 Function:
    buildGraphFileName
 Description:
    Writes "<prefix><suffix>" into a fixed-size buffer and terminates the
    program when the result does not fit, instead of overflowing the
    buffer or silently opening a truncated file name.
 Input:
    1. buf:             destination buffer
    2. size:            size of destination buffer in bytes
    3. prefix:          prefix of graph file
    4. suffix:          file name suffix, e.g. ".updated.edge"
 Output:
    None.
 Return:
    None.
 *************************************************/
static void buildGraphFileName ( char * buf, size_t size, const char * prefix, const char * suffix )
{
	int written = snprintf ( buf, size, "%s%s", prefix, suffix );

	if ( written < 0 || ( size_t ) written >= size )
	{
		fprintf ( stderr, "File name %s%s is too long.\n", prefix, suffix );
		exit ( 1 );
	}
}

/*************************************************
 Function:
    maskContigPair
 Description:
    Sets the mask flag of a contig and of its twin in contig_array.
 Input:
    1. ctg:             index of the contig
    2. twin:            index of its twin (may equal ctg)
 Output:
    None.
 Return:
    Number of contigs masked: 1 if ctg is its own twin, 2 otherwise.
 *************************************************/
static int maskContigPair ( unsigned int ctg, unsigned int twin )
{
	contig_array[ctg].mask = 1;
	contig_array[twin].mask = 1;
	return ctg == twin ? 1 : 2;
}

/*************************************************
 Function:
    loadUpdatedEdges
 Description:
    Loads contig information and masks some contigs according to setting.
    1. Reads the kmer size (overlaplen) from "<graphfile>.preGraphBasic".
    2. Reads every '>' record of "<graphfile>.updated.edge", assigns each
       one a new index through uniqueLenSearch() on the sorted lengths,
       fills contig_array/index_array and writes one
       "index newIndex bal_edge" line per record to
       "<graphfile>.newContigIndex".
    3. Computes cvgAvg from records with non-zero coverage and length > 100.
    4. If maskRep is set, masks contigs by coverage; then masks contigs
       shorter than ctg_short.
    5. Calls loadArcs() and loadContig().
    Terminates the program if a file name does not fit the local buffer or
    if the edge file holds more '>' records than its header declares.
 Input:
    1. graphfile:       prefix of graph file
 Output:
    None.
 Return:
    None.
 *************************************************/
void loadUpdatedEdges ( char * graphfile )
{
	char c, name[256], line[1024];
	int bal_ed, cvg;
	FILE * fp, *out_fp;
	unsigned int num_ctgge = 0, length, index = 0, num_kmer;
	unsigned int i, j;
	int newIndex;
	unsigned int * length_array, *flag_array, diff_len;
	long long cvgSum = 0;
	long long counter = 0;
	int ctg_short_cutoff;
	float high_cvg_cutoff1, high_cvg_cutoff2, low_cvg_cutoff;
	int cut_len;
	int bal_i;
	//get overlaplen from *.preGraphBasic
	buildGraphFileName ( name, sizeof ( name ), graphfile, ".preGraphBasic" );
	fp = ckopen ( name, "r" );

	while ( fgets ( line, sizeof ( line ), fp ) != NULL )
	{
		if ( line[0] == 'V' )
		{
			sscanf ( line + 6, "%u %c %d", &num_kmer, &c, &overlaplen );
			fprintf ( stderr, "Kmer size: %d\n", overlaplen );
			break;
		}
	}

	fclose ( fp );
	//outside compatible mode the kmer size is subtracted from every stored length
	cut_len = COMPATIBLE_MODE == 0 ? overlaplen : 0;

	if ( ctg_short == 0 )
	{
		ctg_short = overlaplen + 2;
	}

	//strict length cutoff: 100 when 2*overlaplen+2 is below 100, otherwise 0
	ctg_short_cutoff = 2 * overlaplen + 2 < 100 ? 100 : 0;
	buildGraphFileName ( name, sizeof ( name ), graphfile, ".updated.edge" );
	fp = ckopen ( name, "r" );
	buildGraphFileName ( name, sizeof ( name ), graphfile, ".newContigIndex" );
	out_fp = ckopen ( name, "w" );

	//header line gives the number of edges; it stays 0 when the header is missing
	while ( fgets ( line, sizeof ( line ), fp ) != NULL )
	{
		if ( line[0] == 'E' )
		{
			sscanf ( line + 5, "%u", &num_ctgge );
			fprintf ( stderr, "There are %d edge(s) in edge file.\n", num_ctgge );
			break;
		}
	}

	index_array = ( unsigned int * ) ckalloc ( ( num_ctgge + 1 ) * sizeof ( unsigned int ) );
	length_array = ( unsigned int * ) ckalloc ( ( num_ctgge + 1 ) * sizeof ( unsigned int ) );
	flag_array = ( unsigned int * ) ckalloc ( ( num_ctgge + 1 ) * sizeof ( unsigned int ) );

	//first pass: collect the length of every edge record (1-based)
	while ( fgets ( line, sizeof ( line ), fp ) != NULL )
	{
		if ( line[0] != '>' )
		{
			continue;
		}

		//the arrays hold num_ctgge records; refuse to write past them
		if ( index >= num_ctgge )
		{
			fprintf ( stderr, "%s.updated.edge holds more edges than the %u declared in its header.\n", graphfile, num_ctgge );
			exit ( 1 );
		}

		sscanf ( line + 7, "%u", &length );
		++index;
		index_array[index] = length;
		length_array[index] = length;
	}

	num_ctg = index;
	orig2new = 1;
	qsort ( & ( length_array[1] ), num_ctg, sizeof ( length_array[0] ), cmp_int );
	//extract unique length: length_array[1..diff_len] keeps each distinct length,
	//flag_array[k] keeps the first sorted position at which that length occurred
	diff_len = 0;

	for ( i = 1; i <= num_ctg; i++ )
	{
		for ( j = i + 1; j <= num_ctg; j++ )
			if ( length_array[j] != length_array[i] )
			{
				break;
			}

		length_array[++diff_len] = length_array[i];
		flag_array[diff_len] = i;
		i = j - 1;
	}

	contig_array = ( CONTIG * ) ckalloc ( ( num_ctg + 1 ) * sizeof ( CONTIG ) );
	//load edges
	index = 0;
	rewind ( fp );

	while ( fgets ( line, sizeof ( line ), fp ) != NULL )
	{
		if ( line[0] != '>' )
		{
			continue;
		}

		//index_array and contig_array were sized from the first pass
		if ( index >= num_ctg )
		{
			fprintf ( stderr, "%s.updated.edge holds more edges than the %u counted after its header.\n", graphfile, index );
			exit ( 1 );
		}

		sscanf ( line, ">length %u,%d,%d", &length, &bal_ed, &cvg );
		newIndex = uniqueLenSearch ( length_array, flag_array, diff_len, length );
		index_array[++index] = newIndex;

		if ( length != 0 ) { contig_array[newIndex].length = length - cut_len; }
		else  { contig_array[newIndex].length = 0; }

		contig_array[newIndex].bal_edge = bal_ed + 1;
		contig_array[newIndex].downwardConnect = NULL;
		contig_array[newIndex].mask = 0;
		contig_array[newIndex].flag = 0;
		contig_array[newIndex].arcs = NULL;
		contig_array[newIndex].seq = NULL;
		contig_array[newIndex].multi = 0;
		contig_array[newIndex].inSubGraph = 0;
		contig_array[newIndex].bubbleInScaff = 0;
		contig_array[newIndex].cvg = cvg / 10;

		if ( cvg && length > 100 )
		{
			counter += length - cut_len;
			//widen before multiplying: the product of two 32-bit values can exceed 32 bits
			cvgSum += ( long long ) cvg * ( length - cut_len );
		}

		fprintf ( out_fp, "%d %d %d\n", index, newIndex, contig_array[newIndex].bal_edge );
	}

	//the edge file, the index file and the length tables are not needed below
	fclose ( fp );
	fclose ( out_fp );
	free ( ( void * ) length_array );
	free ( ( void * ) flag_array );

	if ( counter )
	{
		cvgAvg = cvgSum / counter / 10 > 2 ? cvgSum / counter / 10 : 3;
	}

	//mark repeats
	if ( maskRep )
	{
		high_cvg_cutoff1 = cvg_high * cvgAvg;
		high_cvg_cutoff2 = cvg_high * cvgAvg * 0.8;
		low_cvg_cutoff = cvg_low * cvgAvg;
		counter = 0;
		fprintf ( stderr, "Mask contigs with coverage lower than %.1f or higher than %.1f, and strict length %d.\n", low_cvg_cutoff, high_cvg_cutoff1, ctg_short_cutoff );

		for ( i = 1; i <= num_ctg; i++ )
		{
			bal_i = getTwinCtg ( i );

			if ( ( contig_array[i].cvg + contig_array[bal_i].cvg ) > 2 * high_cvg_cutoff1 )
			{
				//combined coverage of the pair is above twice the high cutoff
				counter += maskContigPair ( i, bal_i );
			}
			else if ( contig_array[i].length < ctg_short_cutoff && ( contig_array[i].cvg > high_cvg_cutoff2 || contig_array[bal_i].cvg > high_cvg_cutoff2 || ( contig_array[i].cvg < low_cvg_cutoff && contig_array[bal_i].cvg < low_cvg_cutoff ) ) )
			{
				//below the strict length with either strand above the relaxed high cutoff,
				//or both strands below the low cutoff
				counter += maskContigPair ( i, bal_i );
			}
			else if ( cvgAvg < 50 && ( contig_array[i].cvg >= 63 || contig_array[bal_i].cvg >= 63 ) )
			{
				counter += maskContigPair ( i, bal_i );
			}

			//the twin was handled together with this contig, skip it
			if ( isSmallerThanTwin ( i ) )
			{
				i++;
			}
		}

		fprintf ( stderr, "Average contig coverage is %d, %lld contig(s) masked.\n", cvgAvg, counter );
	}

	//mask short contigs that are not masked yet
	counter = 0;

	for ( i = 1; i <= num_ctg; i++ )
	{
		if ( contig_array[i].mask )
		{
			continue;
		}

		bal_i = getTwinCtg ( i );

		if ( contig_array[i].length < ctg_short )
		{
			counter += maskContigPair ( i, bal_i );
		}

		if ( isSmallerThanTwin ( i ) )
		{
			i++;
		}
	}

	fprintf ( stderr, "Mask contigs shorter than %d, %lld contig(s) masked.\n", ctg_short, counter );
	loadArcs ( graphfile );
	loadContig ( graphfile );
	fprintf ( stderr, "Done loading updated edges.\n" );
}
Пример #2
0
/*************************************************
 Function:
    output_contig
 Description:
    Writes the edges of ed_array as contigs to "<outfile>.contig", prints
    length statistics (number, sum, average, longest, N50, N90) for edges
    whose length + overlaplen reaches len_bar, and writes the table
    "<outfile>.ContigIndex".
    Edges are renumbered by uniqueLenSearch() on their sorted lengths;
    index_array (old index -> new index) and flag_array (new index -> old
    index) are not declared in this function and stay allocated on return.
    Terminates the program if an output file name does not fit the local
    buffer.
 Input:
    1. ed_array:        edge array, indexed from 1
    2. ed_num:          number of edges in ed_array
    3. outfile:         prefix of output files
    4. cut_len:         only echoed in the final summary message
 Output:
    None.
 Return:
    None.
 *************************************************/
void output_contig (EDGE * ed_array, unsigned int ed_num, char *outfile, int cut_len)
{
	char temp[256];

	FILE * fp, *fp_contig;
	int flag, count, len_c;
	int signI;
	int written;
	unsigned int i, j, diff_len=0;
	long long sum = 0, N90, N50;
	unsigned int *length_array;
	unsigned int *all_length_arr;

	boolean tip;

	// Bounded formatting: a long prefix must not overrun temp[].
	written = snprintf (temp, sizeof (temp), "%s.contig", outfile);
	if (written < 0 || (size_t) written >= sizeof (temp))
	{
		fprintf (stderr, "Output file name %s.contig is too long.\n", outfile);
		exit (1);
	}
	fp = ckopen (temp, "w");

	index_array = (unsigned int *)ckalloc((ed_num+1)*sizeof(unsigned int));
	all_length_arr = (unsigned int*) ckalloc((ed_num+1)*sizeof(unsigned int));
	flag_array = (unsigned int*)ckalloc((ed_num+1)*sizeof(unsigned int));

	for (i=1; i<=ed_num; ++i)
	{
		index_array[i] = ed_array[i].length;
		all_length_arr[i] = ed_array[i].length;
	}

	qsort(&all_length_arr[1], ed_num, sizeof(all_length_arr[0]), cmp_int);

	// Compact the sorted lengths to distinct values; flag_array[k] keeps the
	// first sorted position at which the k-th distinct length occurred.
	for (i=1; i<=ed_num; ++i)
	{
		for (j=i+1; j<=ed_num; ++j)
		{
			if (all_length_arr[i] != all_length_arr[j])
				break;
		}
		all_length_arr[++diff_len] = all_length_arr[i];
		flag_array[diff_len] = i;
		i = j-1;

	}

	// Replace every stored length by the new index uniqueLenSearch assigns.
	for (i=1; i<=ed_num; ++i)
	{
		index_array[i] = uniqueLenSearch(all_length_arr, flag_array, diff_len, index_array[i]);
	}

	// Reuse flag_array as the inverse map: new index -> old index.
	for (i=1; i<=ed_num; ++i)
	{
		flag_array[index_array[i]] = i;
	}

	free((void*)all_length_arr);

	length_array = (unsigned int *) ckalloc (ed_num * sizeof (unsigned int));
	kmerSeq = (char *) ckalloc (overlaplen * sizeof (char));

	// Collect lengths for the statistics and count the contigs to output.
	// NOTE(review): the length is collected before the deleted/empty check,
	// so such edges still enter the statistics - confirm this is intended.
	count = len_c = 0;
	for (i = 1; i <= ed_num; i++)
	{
		if ((ed_array[i].length + overlaplen) >= len_bar)
		{
			length_array[len_c] = ed_array[i].length + overlaplen;
			sum += length_array[len_c];
			len_c++;
		}
		if (ed_array[i].length < 1 || ed_array[i].deleted)
		{
			continue;
		}
		count++;
		// The twin directly follows and is counted together with this edge.
		if (EdSmallerThanTwin (i))
		{
			i++;
		}
	}

	qsort ( length_array, len_c, sizeof ( length_array[0] ), cmp_int );

	if ( len_c > 0 )
	{
		printf ( "%d ctgs longer than %d, sum up %lldbp, with average length %lld\n", len_c, len_bar, sum, sum / len_c );
		printf ( "the longest is %dbp, ", length_array[len_c - 1] );
	}

	// Walk from the longest contig down until 50% / 90% of the total is covered.
	N50 = sum * 0.5;
	N90 = sum * 0.9;
	sum = flag = 0;
	for (signI = len_c - 1; signI >= 0; signI--)
	{
		sum += length_array[signI];
		if (!flag && sum >= N50)
		{
			printf ("contig N50 is %d bp,", length_array[signI]);
			flag = 1;
		}
		if (sum >= N90)
		{
			printf ("contig N90 is %d bp\n", length_array[signI]);
			break;
		}
	}

	// Output contigs in new-index order; j is the matching old index.
	for (i = 1; i <= ed_num; i++)
	{
		j = flag_array[i];
		if (ed_array[j].deleted || ed_array[j].length < 1)
		{
			continue;
		}
		// Flagged as tip unless both the edge and its twin have arcs.
		if (ed_array[j].arcs && ed_array[getTwinEdge (j)].arcs)
		{
			tip = 0;
		}
		else
		{
			tip = 1;
		}
		output_1contig (i, &(ed_array[j]), fp, tip);
		if (EdSmallerThanTwin (j))
		{
			i++;
		}
	}

	fclose (fp);
	free ((void *) kmerSeq);
	free ((void *) length_array);
	printf ("%d contigs longer than %d output\n", count, cut_len);

	written = snprintf (temp, sizeof (temp), "%s.ContigIndex", outfile);
	if (written < 0 || (size_t) written >= sizeof (temp))
	{
		fprintf (stderr, "Output file name %s.ContigIndex is too long.\n", outfile);
		exit (1);
	}
	fp_contig = ckopen (temp, "w");
	fprintf (fp_contig, "Edge_num %d %d\n", ed_num, count);
	fprintf (fp_contig, "index\tlength\treverseComplement\n");

	// flag_array was sized and filled from ed_num/ed_array, so the table is
	// walked with the same parameters rather than with unrelated globals.
	for (i = 1; i <= ed_num; i++)
	{
		j = flag_array[i];
		fprintf (fp_contig, "%d\t%d\t", i, ed_array[j].length + overlaplen);
		if (EdSmallerThanTwin (j))
		{
			fprintf (fp_contig, "1\n");
			i++;
		}
		else if (EdLargerThanTwin (j))
		{
			fprintf (fp_contig, "-1\n");
		}
		else
		{
			fprintf (fp_contig, "0\n");
		}
	}
	fclose (fp_contig);
}