/*
 * findspans - scan seq[start..end) for CpG-rich spans and print one report
 * line per span found.
 *
 * A running score is kept while walking the sequence: it goes up by CPGSCORE
 * at every position where the two characters "CG" begin (never at the last
 * position of the range, so seq[i+1] stays inside it), goes down by one
 * everywhere else, and is clamped at zero.  A span ends when the score falls
 * back to zero after having been positive; the span is reported from just
 * after the last zero-score position to the position where the score peaked.
 * The stretch between that peak and the current position is then searched
 * again recursively, because it may hold further, smaller spans.
 *
 *   start, end  half-open index range of seq to scan
 *   seq         sequence characters (compared against 'C' and 'G')
 *   seqname     name printed in the first output column and passed to getstats
 *
 * Always returns 0; the callers visible in this file ignore the value.
 *
 * The counts ncpg / ngpc / ngc are whatever getstats (defined elsewhere)
 * stores in them -- presumably CpG, GpC and G+C counts for the span.
 */
int findspans ( int start, int end, char *seq, char *seqname )
{
  int i ;
  int sc = 0 ;      /* running score at the current position */
  int lsc = 0 ;     /* running score at the previous position */
  int imn = -1 ;    /* last index with zero score; -1, else sequences which
                       start with pattern reported badly */
  int imx = 0 ;     /* index at which the maximum score mx was reached */
  int mx = 0 ;      /* maximum score seen since the last reset */
  int ncpg, ngpc, ngc ;

  i = start ;
  while ( i < end )
    {
      lsc = sc ;
      /* The former "sc = cond ? sc += CPGSCORE : --sc" modified sc twice
         without sequencing (undefined behaviour in C); this is the same
         update written as a single modification. */
      sc += ( end-1-i && seq[i]=='C' && seq[i+1]=='G' ) ? CPGSCORE : -1 ;
      sc = sc < 0 ? 0 : sc ;

      if ( sc == 0 && lsc )  /* should threshold here */
        {
          /* imn+1 is the start of the match.
             imx+1 is the end of the match, given pattern-length=2.
             fetchdb using reported co-ordinates will return a sequence
             which both starts and ends with a complete pattern.
             Further +1 adjusts so that start of sequence = 1 */
          getstats ( imn+1, imx+2, seq, seqname, &ncpg, &ngpc, &ngc ) ;
          printf("%s\t %d\t %d\t %d\t CpG: %d\t %.1f\t %.1f\n",
                 seqname, imn+2, imx+2, mx, ncpg,
                 ngc*100.0/(imx+1-imn), 1.0*ncpg/ngpc) ;

          /* Recursive call searches from one position after the end of the
             last match to the current position */
          findspans( imx+2, i, seq, seqname ) ;
          sc = lsc = imn = imx = mx = 0 ;
        }

      /* imx must be updated before mx: both tests compare against the old mx */
      imx = sc > mx ? i : imx ;
      mx = sc > mx ? sc : mx ;
      imn = sc == 0 ? i : imn ;
      ++i ;
    }

  /* The range ended while a span was still open: report it and search the
     remainder after its peak. */
  if ( sc != 0 )  /* should threshold here */
    {
      getstats ( imn+1, imx+2, seq, seqname, &ncpg, &ngpc, &ngc ) ;
      /* NOTE(review): the last column is printed with %.2f here but %.1f in
         the in-loop report above; kept as found. */
      printf("%s\t %d\t %d\t %d\t CpG: %d\t %.1f\t %.2f\n",
             seqname, imn+2, imx+2, mx, ncpg,
             ngc*100.0/(imx+1-imn), 1.0*ncpg/ngpc) ;
      findspans( imx+2, end, seq, seqname ) ;
    }

  return 0 ;
}
/*------------------------------------------------------*/ void main (int argc, char **argv) { char *seq, *seqname, *desc ; int conv[] = { 0, 1, 2, 3, 4 } ; int length ; int i ; static FILE *fil ; char c, *cp ; extern char* malloc() ; /*------------------------------------------------------*/ switch ( argc ) { default: if (argc != 2) usage () ; } if (!(seqname = malloc (MAXNAMELEN+1))) { fprintf (stderr, "Couldn't malloc %d bytes", MAXNAMELEN) ; exit (-1) ; } if (!(seq = malloc (MALLOCBLOCK+1))) { fprintf (stderr, "Couldn't malloc %d bytes", MALLOCBLOCK) ; exit (-1) ; } if (!(fil = fopen ( argv[1], "r" ))) usage (); while ( readSequence(fil, dna2textConv, &seq, &seqname, &desc, &length) ) /* once through per sequence */ { i = 0 ; while ( seqname[i] != ' ' && seqname[i] != '\0' && i < 256 ) ++i ; seqname[i] = '\0' ; findspans ( 0, length, seq, seqname ) ; } exit (0); }
/*
 * report_span - gather statistics for one candidate span and print it if it
 * passes the reporting filters.
 *
 *   imn  last zero-score index before the span (span starts at imn+1)
 *   imx  index at which the span's score peaked
 *   mx   the peak score, printed as-is
 *
 * getstats (defined elsewhere) fills in ncpg, ngpc, ng and nc -- presumably
 * the CpG, GpC, G and C counts of the span.  A line is printed only when
 *   - imx - imn exceeds 199,
 *   - G+C makes up more than 50% of the window, and
 *   - the observed/expected CpG ratio exceeds 0.60.
 */
static void report_span ( int imn, int imx, int mx, char *seq, char *seqname )
{
  const int minSpan = 199 ;
  const double minGcPct = 50.0 ;
  const double minObsToExp = 0.60 ;
  int ncpg, ngpc, ng, nc, ngc ;
  int winlen = imx+1-imn ;
  float expect, obsToExp ;

  /* imn+1 is the start of the match.
     imx+1 is the end of the match, given pattern-length=2.
     fetchdb using reported co-ordinates will return a sequence
     which both starts and ends with a complete pattern.
     Further +1 adjusts so that start of sequence = 1 */
  getstats ( imn+1, imx+2, seq, seqname, &ncpg, &ngpc, &ng, &nc ) ;
  ngc = ng + nc ;

  if ( imx - imn > minSpan && (ngc*100.0/winlen) > minGcPct )
    {
      /* ASH 3/23/04: expected val from Gardiner-Garden & Frommer '87.
         The product is formed in floating point: nc * ng as int arithmetic
         can overflow on long windows. */
      expect = ((float)nc * (float)ng) / (float)winlen ;
      obsToExp = (float)ncpg / expect ;
      if ( obsToExp > minObsToExp )
        printf("%s\t %d\t %d\t %d\t CpG: %d\t %.1f\t %.2f\t %.2f\n",
               seqname, imn+2, imx+2, mx, ncpg,
               ngc*100.0/winlen, 1.0*ncpg/ngpc, obsToExp) ;
    }
}

/*
 * findspans - scan seq[start..end) for CpG-rich spans and report those that
 * pass the filters in report_span.
 *
 * A running score is kept while walking the sequence: it goes up by CPGSCORE
 * at every position where the two characters "CG" begin (never at the last
 * position of the range, so seq[i+1] stays inside it), goes down by one
 * everywhere else, and is clamped at zero.  A span ends when the score falls
 * back to zero after having been positive.  After a span is handled, the
 * stretch between its peak and the current position is searched again
 * recursively, because it may hold further, smaller spans.
 *
 *   start, end  half-open index range of seq to scan
 *   seq         sequence characters (compared against 'C' and 'G')
 *   seqname     name printed in the first output column and passed to getstats
 */
void findspans ( int start, int end, char *seq, char *seqname )
{
  int i ;
  int sc = 0 ;      /* running score at the current position */
  int lsc = 0 ;     /* running score at the previous position */
  int imn = -1 ;    /* last index with zero score; -1, else sequences which
                       start with pattern reported badly */
  int imx = 0 ;     /* index at which the maximum score mx was reached */
  int mx = 0 ;      /* maximum score seen since the last reset */

  i = start ;
  while ( i < end )
    {
      lsc = sc ;
      sc += ( end-1-i && seq[i]=='C' && seq[i+1]=='G' ) ? CPGSCORE : -1 ;
      sc = sc < 0 ? 0 : sc ;

      if ( sc == 0 && lsc )
        {
          report_span ( imn, imx, mx, seq, seqname ) ;

          /* Recursive call searches from one position after the end of the
             last match to the current position */
          findspans( imx+2, i, seq, seqname ) ;
          sc = lsc = imn = imx = mx = 0 ;
        }

      /* imx must be updated before mx: both tests compare against the old mx */
      imx = sc > mx ? i : imx ;
      mx = sc > mx ? sc : mx ;
      imn = sc == 0 ? i : imn ;
      ++i ;
    }

  /* The range ended while a span was still open.
     ASH 3/23/04: this test & output are consistent with the in-loop case. */
  if ( sc != 0 )
    {
      report_span ( imn, imx, mx, seq, seqname ) ;
      findspans( imx+2, end, seq, seqname ) ;
    }
}