/************************************************* Function: loadUpdatedEdges Description: Loads contig information and masks some contigs according to setting. Input: 1. graphfile: prefix of graph file Output: None. Return: None. *************************************************/ void loadUpdatedEdges ( char * graphfile ) { char c, name[256], line[1024]; int bal_ed, cvg; FILE * fp, *out_fp; Kmer from_kmer, to_kmer; unsigned int num_ctgge, length, index = 0, num_kmer; unsigned int i = 0, j; int newIndex; unsigned int * length_array, *flag_array, diff_len; char * outfile = graphfile; long long cvgSum = 0; long long counter = 0; unsigned int avg_arc_wt; int ctg_short_cutoff; float high_cvg_cutoff1, high_cvg_cutoff2, low_cvg_cutoff; int cut_len; //get overlaplen from *.preGraphBasic sprintf ( name, "%s.preGraphBasic", graphfile ); fp = ckopen ( name, "r" ); while ( fgets ( line, sizeof ( line ), fp ) != NULL ) { if ( line[0] == 'V' ) { sscanf ( line + 6, "%d %c %d", &num_kmer, &c, &overlaplen ); fprintf ( stderr, "Kmer size: %d\n", overlaplen ); break; } } cut_len = COMPATIBLE_MODE == 0 ? overlaplen : 0; if ( ctg_short == 0 ) { ctg_short = overlaplen + 2; } ctg_short_cutoff = 2 * overlaplen + 2 < 100 ? 
100 : 0; fclose ( fp ); sprintf ( name, "%s.updated.edge", graphfile ); fp = ckopen ( name, "r" ); sprintf ( name, "%s.newContigIndex", outfile ); out_fp = ckopen ( name, "w" ); while ( fgets ( line, sizeof ( line ), fp ) != NULL ) { if ( line[0] == 'E' ) { sscanf ( line + 5, "%d", &num_ctgge ); fprintf ( stderr, "There are %d edge(s) in edge file.\n", num_ctgge ); break; } } index_array = ( unsigned int * ) ckalloc ( ( num_ctgge + 1 ) * sizeof ( unsigned int ) ); length_array = ( unsigned int * ) ckalloc ( ( num_ctgge + 1 ) * sizeof ( unsigned int ) ); flag_array = ( unsigned int * ) ckalloc ( ( num_ctgge + 1 ) * sizeof ( unsigned int ) ); while ( fgets ( line, sizeof ( line ), fp ) != NULL ) { if ( line[0] == '>' ) { sscanf ( line + 7, "%d", &length ); index_array[++index] = length; length_array[++i] = length; } } num_ctg = index; orig2new = 1; qsort ( & ( length_array[1] ), num_ctg, sizeof ( length_array[0] ), cmp_int ); //extract unique length diff_len = 0; for ( i = 1; i <= num_ctg; i++ ) { for ( j = i + 1; j <= num_ctg; j++ ) if ( length_array[j] != length_array[i] ) { break; } length_array[++diff_len] = length_array[i]; flag_array[diff_len] = i; i = j - 1; } contig_array = ( CONTIG * ) ckalloc ( ( num_ctg + 1 ) * sizeof ( CONTIG ) ); //load edges index = 0; rewind ( fp ); while ( fgets ( line, sizeof ( line ), fp ) != NULL ) { if ( line[0] == '>' ) { sscanf ( line, ">length %u,%d,%d", &length, &bal_ed, &cvg ); newIndex = uniqueLenSearch ( length_array, flag_array, diff_len, length ); index_array[++index] = newIndex; if ( length != 0 ) { contig_array[newIndex].length = length - cut_len; } else { contig_array[newIndex].length = 0; } contig_array[newIndex].bal_edge = bal_ed + 1; contig_array[newIndex].downwardConnect = NULL; contig_array[newIndex].mask = 0; contig_array[newIndex].flag = 0; contig_array[newIndex].arcs = NULL; contig_array[newIndex].seq = NULL; contig_array[newIndex].multi = 0; contig_array[newIndex].inSubGraph = 0; 
contig_array[newIndex].bubbleInScaff = 0; contig_array[newIndex].cvg = cvg / 10; if ( cvg && length > 100 ) { counter += length - cut_len; cvgSum += cvg * ( length - cut_len ); } fprintf ( out_fp, "%d %d %d\n", index, newIndex, contig_array[newIndex].bal_edge ); } } if ( counter ) { cvgAvg = cvgSum / counter / 10 > 2 ? cvgSum / counter / 10 : 3; } //mark repeats int bal_i; if ( maskRep ) { high_cvg_cutoff1 = cvg_high * cvgAvg; high_cvg_cutoff2 = cvg_high * cvgAvg * 0.8; low_cvg_cutoff = cvg_low * cvgAvg; counter = 0; fprintf ( stderr, "Mask contigs with coverage lower than %.1f or higher than %.1f, and strict length %d.\n", low_cvg_cutoff, high_cvg_cutoff1, ctg_short_cutoff ); for ( i = 1; i <= num_ctg; i++ ) { bal_i = getTwinCtg ( i ); if ( ( contig_array[i].cvg + contig_array[bal_i].cvg ) > 2 * high_cvg_cutoff1 ) { contig_array[i].mask = 1; contig_array[bal_i].mask = 1; if ( i == bal_i ) { counter += 1; } else { counter += 2; } } else if ( contig_array[i].length < ctg_short_cutoff && ( contig_array[i].cvg > high_cvg_cutoff2 || contig_array[bal_i].cvg > high_cvg_cutoff2 || ( contig_array[i].cvg < low_cvg_cutoff && contig_array[bal_i].cvg < low_cvg_cutoff ) ) ) { contig_array[i].mask = 1; contig_array[bal_i].mask = 1; if ( i == bal_i ) { counter += 1; } else { counter += 2; } } else if ( cvgAvg < 50 && ( contig_array[i].cvg >= 63 || contig_array[bal_i].cvg >= 63 ) ) { contig_array[i].mask = 1; contig_array[bal_i].mask = 1; if ( i == bal_i ) { counter += 1; } else { counter += 2; } } if ( isSmallerThanTwin ( i ) ) { i++; } } fprintf ( stderr, "Average contig coverage is %d, %lld contig(s) masked.\n", cvgAvg, counter ); } counter = 0; for ( i = 1; i <= num_ctg; i++ ) { if ( contig_array[i].mask ) { continue; } bal_i = getTwinCtg ( i ); if ( contig_array[i].length < ctg_short ) { contig_array[i].mask = 1; contig_array[bal_i].mask = 1; if ( i == bal_i ) { counter += 1; } else { counter += 2; } } if ( isSmallerThanTwin ( i ) ) { i++; } } fprintf ( stderr, "Mask contigs 
shorter than %d, %lld contig(s) masked.\n", ctg_short, counter ); avg_arc_wt = loadArcs ( graphfile ); counter = 0; //counter = maskRepeatByArc(avg_arc_wt); //printf ("Mask contigs with multi arcs, %d contig masked\n", counter); //tipsCount(); loadContig ( graphfile ); fprintf ( stderr, "Done loading updated edges.\n" ); free ( ( void * ) length_array ); free ( ( void * ) flag_array ); fclose ( fp ); fclose ( out_fp ); }
/*************************************************
Function:
    output_contig
Description:
    Writes the edges of ed_array to "<outfile>.contig", prints length
    statistics (count, sum, average, longest, N50, N90) to stdout, and
    writes an index table to "<outfile>.ContigIndex".
    Edges are renumbered through the rank of their length: lengths are
    sorted, collapsed to unique values, and uniqueLenSearch() maps each
    edge to a new index. flag_array[] is then inverted so that
    flag_array[newIndex] gives the original edge index.
    Globals written: index_array and flag_array (both allocated here and
    not freed by this function), kmerSeq (allocated and freed here).
Input:
    1. ed_array:        edge array, used 1-based
    2. ed_num:          number of edges in ed_array
    3. outfile:         prefix of the output files
    4. cut_len:         only reported in the final summary message
Output:
    None.
Return:
    None.
*************************************************/
void output_contig ( EDGE * ed_array, unsigned int ed_num, char * outfile, int cut_len )
{
    char temp[256];
    FILE * fp, *fp_contig;
    int flag, count, len_c;
    int signI;
    unsigned int i, j, diff_len = 0;
    long long sum = 0, N90, N50;
    unsigned int * length_array;
    unsigned int * all_length_arr;
    boolean tip;

    /* snprintf bounds the write to temp[]; an over-long prefix is truncated
       instead of overflowing the buffer. */
    snprintf ( temp, sizeof ( temp ), "%s.contig", outfile );
    fp = ckopen ( temp, "w" );

    /* All three arrays are used 1-based, hence the extra slot. */
    index_array = ( unsigned int * ) ckalloc ( ( ed_num + 1 ) * sizeof ( unsigned int ) );
    all_length_arr = ( unsigned int * ) ckalloc ( ( ed_num + 1 ) * sizeof ( unsigned int ) );
    flag_array = ( unsigned int * ) ckalloc ( ( ed_num + 1 ) * sizeof ( unsigned int ) );

    for ( i = 1; i <= ed_num; ++i )
    {
        index_array[i] = ed_array[i].length;
        all_length_arr[i] = ed_array[i].length;
    }

    qsort ( &all_length_arr[1], ed_num, sizeof ( all_length_arr[0] ), cmp_int );

    /* Collapse the sorted lengths to distinct values; flag_array[k] is the
       first sorted position holding the k-th distinct length. */
    for ( i = 1; i <= ed_num; ++i )
    {
        for ( j = i + 1; j <= ed_num; ++j )
        {
            if ( all_length_arr[i] != all_length_arr[j] )
            {
                break;
            }
        }

        all_length_arr[++diff_len] = all_length_arr[i];
        flag_array[diff_len] = i;
        i = j - 1;
    }

    /* index_array[i] held the length of edge i; replace it by its new index. */
    for ( i = 1; i <= ed_num; ++i )
    {
        index_array[i] = uniqueLenSearch ( all_length_arr, flag_array, diff_len, index_array[i] );
    }

    /* Invert the mapping: flag_array[newIndex] = original edge index. */
    for ( i = 1; i <= ed_num; ++i )
    {
        flag_array[index_array[i]] = i;
    }

    free ( ( void * ) all_length_arr );
    length_array = ( unsigned int * ) ckalloc ( ed_num * sizeof ( unsigned int ) );
    kmerSeq = ( char * ) ckalloc ( overlaplen * sizeof ( char ) );

    /* First scan: gather lengths for the statistics and count the edges
       that will be reported. */
    count = len_c = 0;

    for ( i = 1; i <= ed_num; i++ )
    {
        if ( ( ed_array[i].length + overlaplen ) >= len_bar )
        {
            length_array[len_c++] = ed_array[i].length + overlaplen;
        }

        if ( ed_array[i].length < 1 || ed_array[i].deleted )
        {
            continue;
        }

        count++;

        /* An edge and its twin are counted once. */
        if ( EdSmallerThanTwin ( i ) )
        {
            i++;
        }
    }

    sum = 0;

    for ( signI = len_c - 1; signI >= 0; signI-- )
    {
        sum += length_array[signI];
    }

    qsort ( length_array, len_c, sizeof ( length_array[0] ), cmp_int );

    if ( len_c > 0 )
    {
        printf ( "%d ctgs longer than %d, sum up %lldbp, with average length %lld\n", len_c, len_bar, sum, sum / len_c );
        printf ( "the longest is %ubp, ", length_array[len_c - 1] );
    }

    /* Walk from the longest length down, accumulating until 50% and 90% of
       the total are reached. */
    N50 = sum * 0.5;
    N90 = sum * 0.9;
    sum = flag = 0;

    for ( signI = len_c - 1; signI >= 0; signI-- )
    {
        sum += length_array[signI];

        if ( !flag && sum >= N50 )
        {
            printf ( "contig N50 is %u bp,", length_array[signI] );
            flag = 1;
        }

        if ( sum >= N90 )
        {
            printf ( "contig N90 is %u bp\n", length_array[signI] );
            break;
        }
    }

    /* Emit contigs in new-index order; i is the new index, j the original. */
    for ( i = 1; i <= ed_num; i++ )
    {
        j = flag_array[i];

        if ( ed_array[j].deleted || ed_array[j].length < 1 )
        {
            continue;
        }

        /* An edge is flagged as a tip unless both it and its twin have arcs. */
        if ( ed_array[j].arcs && ed_array[getTwinEdge ( j )].arcs )
        {
            tip = 0;
        }
        else
        {
            tip = 1;
        }

        output_1contig ( i, & ( ed_array[j] ), fp, tip );

        if ( EdSmallerThanTwin ( j ) )
        {
            i++;
        }
    }

    fclose ( fp );
    free ( ( void * ) kmerSeq );
    free ( ( void * ) length_array );
    printf ( "%d contigs longer than %d output\n", count, cut_len );

    snprintf ( temp, sizeof ( temp ), "%s.ContigIndex", outfile );
    fp_contig = ckopen ( temp, "w" );
    fprintf ( fp_contig, "Edge_num %u %d\n", ed_num, count );
    fprintf ( fp_contig, "index\tlength\treverseComplement\n" );

    /* Iterate over the edges passed in: flag_array has ed_num + 1 entries,
       so the bound and the array must be the function's own parameters. */
    for ( i = 1; i <= ed_num; i++ )
    {
        j = flag_array[i];
        fprintf ( fp_contig, "%u\t%d\t", i, ed_array[j].length + overlaplen );

        if ( EdSmallerThanTwin ( j ) )
        {
            fprintf ( fp_contig, "1\n" );
            /* The twin shares this row, so step over it. */
            i++;
        }
        else if ( EdLargerThanTwin ( j ) )
        {
            fprintf ( fp_contig, "-1\n" );
        }
        else
        {
            fprintf ( fp_contig, "0\n" );
        }
    }

    fclose ( fp_contig );
}