示例#1
0
/*
 * Finds matches of a set of sites (CPS file) in a chain alignment (CHAIN file).
 *
 * Command-line keys:
 *   -i <file>  CPS file
 *   -d <file>  CHAIN alignment file
 *   -o <file>  output file (required)
 *   -m <int>   margin length (default 0)
 *   -v         suppress verbose output
 *
 * Processing steps:
 *   1. Pass 1 over the CPS file counts the records per chromosome code
 *      (as returned by assign_code) and turns the counts into start offsets.
 *   2. Pass 2 stores position/strand/gene id/site id/type of every record in
 *      the slot range of its chromosome.
 *   3. Records of each chromosome are bubble-sorted by position.
 *   4. Every chain of the CHAIN file is read into size/dt/dq arrays, and each
 *      site lying between start1 and end1 of the chain on chromosome chr1 is
 *      projected through the block that contains it; one output line is
 *      written per projected site.
 *
 * Uses names declared outside this function: MAXBUFFLENGTH, MAXCHR, MAXALN,
 * verbose, outfile, cpsfile, chainfile, assign_code, get_chr_code, swapi,
 * swapc, timestamp_set, timestamp_report.
 *
 * Terminates the process with status 1 on any error and 0 on success.
 */
int main(int argc, char* argv[]) {
    /* File names start empty so that a missing key leads to a clean fopen failure. */
    char cpsfilename[MAXBUFFLENGTH]="";
    char chainfilename[MAXBUFFLENGTH]="";
    char outfilename[MAXBUFFLENGTH]="";

    int marginlength = 0;

    int MAXREC;

    char *pc;
    int x;

    char buff[MAXBUFFLENGTH+1];
    char aux[MAXBUFFLENGTH+1];

    long score;
    int start1,end1,len1,start2,end2,len2;
    char strand1, strand2, chr1[MAXBUFFLENGTH], chr2[MAXBUFFLENGTH];

    int *size, *dq, *dt;
    int a,b,k,i,j,s,m;

    int *position;
    char *strand;
    int *ids;
    int *idg;
    char *type;

    int chridx[MAXCHR+1];   /* first slot of every chromosome; entry MAXCHR holds the total */
    int chroff[MAXCHR+1];   /* number of slots already filled per chromosome */

    char resstr;
    int  rescrd;


    if(argc==1) {
        fprintf(stderr,"Finds matches of the given set of sites (CPS file) in the BLASTZ chain alignment (CHAIN file)\n");
        fprintf(stderr,"Last update by (dp) on Sep 21, 2011\n");
        fprintf(stderr,"Keys:\n -i CPS file (remember to sort by position in ascending order)\n -d CHAIN alignment file\n -o output file\n");
        fprintf(stderr," -m margin length [0]\n -v suppress verbose output [NO]\n");
        exit(1);
    }

    timestamp_set();
    for(i=1;i<argc;i++) {
        pc = argv[i];
        if(*pc != '-') continue;
        switch(*(pc+1)) {
            case 'i':
            case 'd':
            case 'o':
            case 'm':
                /* These keys consume the next argument; make sure it exists. */
                if(i+1>=argc) {
                    fprintf(stderr,"Key %s requires a value, exiting\n", pc);
                    exit(1);
                }
                i++;
                /* Bounded copies: an over-long argument is truncated instead of overflowing. */
                if(*(pc+1) == 'i')      snprintf(cpsfilename,   sizeof(cpsfilename),   "%s", argv[i]);
                else if(*(pc+1) == 'd') snprintf(chainfilename, sizeof(chainfilename), "%s", argv[i]);
                else if(*(pc+1) == 'o') snprintf(outfilename,   sizeof(outfilename),   "%s", argv[i]);
                else                    sscanf(argv[i], "%i", &marginlength);
                break;
            case 'v':
                verbose=0;
                break;
            default:
                break;
        }
    }

    if(outfilename[0]==0) {
        fprintf(stderr,"No output file privided, exiting\n");
        exit(1);
    }
    outfile = fopen(outfilename,"w");
    if(outfile == NULL) {
        fprintf(stderr,"Can't open output file, exiting\n");
        exit(1);
    }

    for(i=0;i<MAXCHR;i++) {
        chridx[i]=chroff[i]=0;
    }

    MAXREC = 0;
    cpsfile= fopen(cpsfilename,"r");
    if(cpsfile==NULL) {
        fprintf(stderr,"Can't access CPS file. Exiting\n");
        exit(1);
    }

    if(verbose) fprintf(stderr,"Reading CPS input pass 1");

    /* Pass 1: count records per chromosome; a line shorter than 2 chars ends the input. */
    while(fgets(buff,MAXBUFFLENGTH,cpsfile)) {
        if(strlen(buff)<2) break;
        sscanf(buff,"%s" , aux);
        chridx[assign_code(aux)]++;
        MAXREC++;
    }
    fclose(cpsfile);

    /* Convert per-chromosome counts into start offsets (exclusive prefix sum). */
    for(s=i=0;i<MAXCHR;i++) {
        x = chridx[i];
        chridx[i] =s;
        s+=x;
    }
    chridx[i] = s;

    position   = (int*)  malloc(sizeof(int)*(s+4));
    strand     = (char*) malloc(sizeof(char)*(s+4));
    type       = (char*) malloc(sizeof(char)*(s+4));
    ids        = (int*)  malloc(sizeof(int)*(s+4));
    idg        = (int*)  malloc(sizeof(int)*(s+4));

    if(position==NULL || strand==NULL || type==NULL || ids==NULL || idg==NULL) {
        fprintf(stderr,"Not enough memory. Terminated\n");
        exit(1);
    }

    cpsfile= fopen(cpsfilename,"r");
    if(cpsfile==NULL) {
        fprintf(stderr,"Can't access CPS file. Exiting\n");
        exit(1);
    }
    if(verbose) fprintf(stderr,", records = %i\nReading CPS input pass 2",MAXREC);

    /* Pass 2: store every record in the next free slot of its chromosome. */
    while(fgets(buff,MAXBUFFLENGTH,cpsfile)) {
        if(strlen(buff)<2) break;
        sscanf(buff,"%s" , aux);
        i = assign_code(aux);
        m = chridx[i]+chroff[i];
        sscanf(buff,"%*s %i %c %i %i %c" , position+m, strand+m, idg+m, ids+m, type+m);
        chroff[i]++;
    }
    fclose(cpsfile);

    if(verbose) fprintf(stderr,"\nSorting segments");

    /* Bubble sort by position inside every chromosome, keeping parallel arrays in step. */
    for(i=0;i<MAXCHR;i++) {
        k=1;
        while(k) {
            k=0;
            for(j=chridx[i];j<chridx[i+1]-1;j++) {
                if(position[j]>position[j+1]) {
                    k=1;
                    swapi(position+j,position+j+1);
                    swapc(strand+j,strand+j+1);
                    swapc(type+j,type+j+1);
                    swapi(ids+j,ids+j+1);
                    swapi(idg+j,idg+j+1);
                }
            }
        }
    }

    if(verbose) fprintf(stderr," done\nProcessing chains");

    /* Per-chain block arrays: block size and the gaps that follow it (dt, dq). */
    size = (int*) malloc(sizeof(int)*MAXALN);
    dq   = (int*) malloc(sizeof(int)*MAXALN);
    dt   = (int*) malloc(sizeof(int)*MAXALN);

    if(size ==0 || dq ==0 || dt==0) {
        fprintf(stderr,"Not enough memory for such long chains. Terminated\n");
        exit(1);
    }

    chainfile = fopen(chainfilename,"r");
    if(chainfile==NULL) {
        fprintf(stderr,"Can't access CHAIN file. Exiting\n");
        exit(1);
    }

    while(fgets(buff,MAXBUFFLENGTH,chainfile)) {
        if(strlen(buff)<2) break;
        buff[5]=0;                      /* isolate the first five characters of the line */
        if(strcmp(buff,"chain")==0) {
            sscanf(buff+6,"%li %s %i %c %i %i %s %i %c %i %i",&score, &chr1[0], &len1, &strand1, &start1, &end1, &chr2[0], &len2, &strand2, &start2, &end2);

            /* Read the block lines of this chain up to the terminating short line. */
            k=0;
            while(fgets(buff,MAXBUFFLENGTH,chainfile)) {
                if(strlen(buff)<2) break;
                /* Check BEFORE writing: the arrays hold exactly MAXALN entries. */
                if(k>=MAXALN) {
                    fprintf(stderr,"Chain length exceeded. Terminating");
                    exit(1);
                }
                /* The final block line carries only a size; make the missing gaps zero
                   instead of leaving stale values from a previous chain. */
                size[k]=dt[k]=dq[k]=0;
                sscanf(buff,"%i %i %i",&size[k],&dt[k],&dq[k]);
                k++;
            }

            x = get_chr_code(chr1);
            if(x<0) continue;           /* chromosome has no sites */

            a=start1;b=start2;          /* start of the current block in both coordinate systems */
            j=0;

            /* Skip sites located before the chain, then walk the sites inside it. */
            for(i=chridx[x];i<chridx[x+1] && position[i]<start1;i++);
            for(;i<chridx[x+1]&& position[i]<end1;i++) {
                /* Advance to the block (plus trailing gap) containing the site.
                   The bound test comes first so size[j] is never read past the chain. */
                while(j<k && position[i]>a+size[j]+dt[j]){
                    a+=size[j]+dt[j];
                    b+=size[j]+dq[j];
                    j++;
                }
                if(j>=k) break;
                /* Report only sites inside the aligned block, respecting the margin. */
                if(position[i]-a > marginlength && a+size[j]-position[i] >= marginlength) {
                    if(strand1==strand2) {
                        resstr = strand[i];
                        rescrd = position[i] - a + b;
                    }
                    else {
                        resstr = (strand[i]=='+') ? '-' : '+';
                        rescrd = len2 - (position[i] - a + b - 1) ;
                    }
                    fprintf(outfile,"%s\t%i\t%c\t%s\t%i\t%c\t%i\t%i\t%c\t%li\n",chr1, position[i], strand[i],chr2,rescrd, resstr, idg[i], ids[i],type[i],score);
                }
            }
        }
    }
    if(verbose) fprintf(stderr," done\n");
    fclose(chainfile);
    fclose(outfile);
    timestamp_report();

    free(size);
    free(dq);
    free(dt);
    free(position);
    free(strand);
    free(type);
    free(ids);
    free(idg);
    exit(0);
}
示例#2
0
/*
 * Extracts sequence windows around sites from an indexed sequence repository.
 *
 * Command-line keys:
 *   -in <file>    input table of sites
 *   -dbx <file>   sequence database file
 *   -idx <file>   index file: a name line followed by "<chr> <length>" lines
 *   -out <file>   output file (stdout when omitted)
 *   -we <long>    exonic window, -wi <long> intronic window
 *   -coord <int>  offset added (times strand) to the position of type 'D' sites
 *   -cis          selects the second input format (format[1])
 *   -all          process every site, not only types 'D' and 'A'
 *   -quiet        suppress verbose output
 *
 * The input is read twice: pass 1 counts records per chromosome code, pass 2
 * stores position, strand, type and id of each record. The index file is then
 * scanned; for every chromosome line the windows of its sites are fetched with
 * fget_segment, reversed with rev1 on the negative strand, and printed unless
 * is_all_n reports the segment as all-N.
 *
 * Uses names declared outside this function: MAXBUFFLENGTH,
 * MAXLONGBUFFLENGTH, MAXCHR, N_CHR_NAMES, verbose, assign_code, get_chr_code,
 * fget_segment, rev1, is_all_n, timestamp_set, timestamp_report.
 *
 * Terminates the process with status 1 on any error and 0 on success.
 */
int main(int argc, char* argv[]) {
    /* All file names start empty so a missing key gives a clean fopen failure. */
    char out_file_name[MAXBUFFLENGTH]="";
    char input_file_name[MAXBUFFLENGTH]="";
    char idx_file_name[MAXBUFFLENGTH]="";
    char dbx_file_name[MAXBUFFLENGTH]="";

    long offset;
    long seqlen;
    long intronic_window = 150;
    long exonic_window = 0;

    char name[MAXBUFFLENGTH];

    char chr_name[MAXBUFFLENGTH];
    char buff[MAXBUFFLENGTH];
    char longbuff[MAXLONGBUFFLENGTH];

    FILE *idx_file;
    FILE *dbx_file;
    FILE *input_file;
    FILE *outfile;

    int i,j,k,n;
    char c;
    long position;
    int strand;

    long** pos;     /* per chromosome: site positions */
    int** str;      /* per chromosome: site strands */
    char** typ;     /* per chromosome: site types */
    char*** ids;    /* per chromosome: site identifiers (heap copies) */
    long p,l;

    int record_count[MAXCHR];
    int record_idx[MAXCHR];
    int cis = 0;
    int coord = 0;
    int all=0;

    int warnings = 0;

    char *key;
    int takes_value;

    char format[][64] = {"%*s %*i %*i %s %li %i %*s %s %c", "%s %li %i %*s %s %c"};

    if(argc==1) {
        fprintf(stderr,"This routine get sequence segments from a custom compressed FASTA repository  (see transf)\n");
        fprintf(stderr,"Last update by Dmitri Pervouchine ([email protected]) on Mar 22, 2013\n");
        fprintf(stderr," -in <aln_file>\n -dbx <database_file>\n -idx <index_file>\n -out <output_file>\n");
        /* The windows are long, so they must be printed with %li. */
        fprintf(stderr," -we <exonic_window> [default=%li]\n -wi <intronic_window> [default=%li]\n -cis [use colunms 1-3] [default=%i]\n", exonic_window, intronic_window, cis);
        fprintf(stderr," -quiet <suppress verbose output> [default=no]\n -all <include all sites>\n -coord <offset for 3'-sites> [default=%i]\n",coord);
        exit(1);
    }

    timestamp_set();
    for(i=1;i<argc;i++) {
        key = argv[i];
        takes_value = strcmp(key,"-in")==0  || strcmp(key,"-dbx")==0 || strcmp(key,"-idx")==0 ||
                      strcmp(key,"-out")==0 || strcmp(key,"-we")==0  || strcmp(key,"-wi")==0  ||
                      strcmp(key,"-coord")==0;
        if(takes_value) {
            /* Never read past the end of argv when the value is missing. */
            if(i+1>=argc) {
                fprintf(stderr,"[ERROR: option %s requires a value, exiting]\n", key);
                exit(1);
            }
            i++;
            /* Bounded copies: an over-long argument is truncated instead of overflowing. */
            if(strcmp(key,"-in")==0)       snprintf(input_file_name, sizeof(input_file_name), "%s", argv[i]);
            else if(strcmp(key,"-dbx")==0) snprintf(dbx_file_name,   sizeof(dbx_file_name),   "%s", argv[i]);
            else if(strcmp(key,"-idx")==0) snprintf(idx_file_name,   sizeof(idx_file_name),   "%s", argv[i]);
            else if(strcmp(key,"-out")==0) snprintf(out_file_name,   sizeof(out_file_name),   "%s", argv[i]);
            else if(strcmp(key,"-we")==0)  sscanf(argv[i], "%li", &exonic_window);
            else if(strcmp(key,"-wi")==0)  sscanf(argv[i], "%li", &intronic_window);
            else                           sscanf(argv[i], "%i", &coord);
        }
        else if(strcmp(key,"-quiet")==0) {
            verbose = 0;
        }
        else if(strcmp(key,"-cis")==0) {
            cis = 1;
        }
        else if(strcmp(key,"-all")==0) {
            all = 1;
        }
    }

    if(out_file_name[0]==0) {
        fprintf(stderr,"[WARNING: output file not specified, redirect to stdout]\n");
        outfile = stdout;
    }
    else {
        outfile = fopen(out_file_name,"w");
        if(outfile == NULL) {
            fprintf(stderr,"[ERROR: output file %s cannot be opened for writing, exiting]\n", out_file_name);
            exit(1);
        }
        if(verbose) fprintf(stderr,"[>%s]\n",out_file_name);
    }

    input_file = fopen(input_file_name,"r");
    if(input_file == NULL) {
        fprintf(stderr,"[ERROR: cannot access %s, exiting]\n", input_file_name);
        exit(1);
    }

    /* The counters are incremented below, so they must start from zero. */
    for(i=0;i<MAXCHR;i++) {
        record_count[i] = record_idx[i] = 0;
    }

    /* Pass 1: count records per chromosome code. */
    if(verbose) fprintf(stderr,"[<%s, pass 1",input_file_name);
    while(fgets(buff,MAXBUFFLENGTH,input_file)) {
        if(strlen(buff)<2) break;
        sscanf(buff, format[cis], &chr_name[0], &position, &strand, &name[0], &c);
        n = assign_code(chr_name);
        record_count[n]++;
    }
    if(verbose) fprintf(stderr,"]\n");

    pos = (long**) malloc(sizeof(long*)*(N_CHR_NAMES+1));
    str = (int**)  malloc(sizeof(int*)*(N_CHR_NAMES+1));
    typ = (char**) malloc(sizeof(char*)*(N_CHR_NAMES+1));
    ids = (char***)malloc(sizeof(char**)*(N_CHR_NAMES+1));

    if(pos==NULL || str==NULL || typ==NULL || ids==NULL) {
        fprintf(stderr,"[ERROR: failed to create index tables, exiting]\n");
        exit(1);
    }

    /* NOTE(review): this loop runs to N_CHR_NAMES while the counters are sized
       MAXCHR; the relation between the two constants is not visible here. */
    for(i=0;i<N_CHR_NAMES;i++) {
        if(record_count[i]>0) {
            pos[i] = (long*)  malloc(sizeof(long)*(record_count[i]+1));
            str[i] = (int*)   malloc(sizeof(int)*(record_count[i]+1));
            typ[i] = (char*)  malloc(sizeof(char)*(record_count[i]+1));
            ids[i] = (char**) malloc(sizeof(char*)*(record_count[i]+1));
            if(pos[i]==NULL || str[i]==NULL || typ[i]==NULL || ids[i]==NULL) {
                fprintf(stderr,"[ERROR: failed to create index tables, exiting]\n");
                exit(1);
            }
            record_idx[i]=0;
        }
    }

    /* Pass 2: store the records in the per-chromosome tables. */
    if(verbose) fprintf(stderr,"[<%s, pass 2",input_file_name);
    fseek(input_file,  0, SEEK_SET);
    while(fgets(buff,MAXBUFFLENGTH,input_file)) {
        if(strlen(buff)<2) break;
        sscanf(buff, format[cis], &chr_name[0], &position, &strand, &name[0], &c);
        i = get_chr_code(chr_name);
        /* presumably get_chr_code returns a negative value for an unknown name;
           never use such a value as an index. */
        if(i<0) continue;
        j = record_idx[i];
        pos[i][j] = position + (c=='D' ? coord*strand : 0);
        str[i][j] = strand;
        typ[i][j] = c;
        ids[i][j] = (char*) malloc(sizeof(char)*(strlen(name)+1));
        if(ids[i][j]==NULL) {
            fprintf(stderr,"[ERROR: failed to create index tables, exiting]\n");
            exit(1);
        }
        strcpy(ids[i][j],name);
        record_idx[i]++;
    }
    fclose(input_file);
    if(verbose) fprintf(stderr,"]\n");

    if(verbose) fprintf(stderr,"[<%s,%s",idx_file_name,dbx_file_name);
    idx_file = fopen(idx_file_name,"r");
    dbx_file = fopen(dbx_file_name,"r");
    if(idx_file == NULL || dbx_file == NULL) {
        fprintf(stderr,"[ERROR: cannot access %s or %s, exiting]\n", idx_file_name, dbx_file_name);
        exit(1);
    }

    /* offset accumulates ceil(seqlen/8) for every chromosome line read so far. */
    offset = 0;
    while(fgets(buff,MAXBUFFLENGTH,idx_file)) {
        if(strlen(buff)<2) break;
        sscanf(buff,"%s" , &name[0]);
        while(fgets(buff,MAXBUFFLENGTH,idx_file)) {
            if(strlen(buff)<2) break;
            sscanf(buff,"%s %li" , &chr_name[0], &seqlen);
            i = get_chr_code(chr_name);
            /* A chromosome without sites (negative code) is skipped, but its
               length must still advance the offset below. */
            if(i>=0) {
                for(k=0;k<record_count[i];k++) {
                    if(pos[i][k]>seqlen) {
                        warnings++;
                        continue;
                    }
                    l = exonic_window + intronic_window;
                    if(typ[i][k]=='D' || typ[i][k]=='A' || all) {
                        /* Window start depends on strand and on which side is exonic. */
                        if(str[i][k]>0) {
                            p = pos[i][k] - 1 - (typ[i][k]=='D' ? exonic_window : intronic_window);
                        }
                        else {
                            p = pos[i][k] - (typ[i][k]=='D' ? intronic_window : exonic_window);
                        }
                        fget_segment(longbuff, dbx_file, offset, p, l);
                        if(str[i][k]<0) {
                            rev1(longbuff);
                        }
                        if(is_all_n(longbuff)) continue;
                        fprintf(outfile,"%s\t%s\t%li\t%li\t%i\t%li\t%s\t%c\n",ids[i][k], chr_name, (str[i][k]>0 ? p + 1 : seqlen - (p + l)), l, str[i][k], seqlen, longbuff,typ[i][k]);
                    }
                }
            }
            offset+= (seqlen % 8 == 0) ? seqlen/8 : (seqlen/8 + 1);
        }
    }
    fclose(outfile);
    fclose(idx_file);
    fclose(dbx_file);
    if(verbose) fprintf(stderr,"]\n");
    if(verbose && warnings>0) fprintf(stderr,"[WARNING: %i windows were out of range, they were ignored]\n", warnings);

    timestamp_report();
    exit(0);
}
示例#3
0
/*
 * Lifts over site coordinates (cps file) through a chain alignment and keeps
 * the gene/site/type columns in the output.
 *
 * Command-line keys:
 *   -in <file>     cps6 input (chr/position/strand/gene/site/type)
 *   -chain <file>  chain alignment file
 *   -out <file>    output file (stdout when omitted)
 *   -margin <int>  margin length (default 0)
 *   -quiet         suppress verbose output
 *
 * Processing steps:
 *   1. Pass 1 counts records per chromosome code; counts become start offsets.
 *   2. Pass 2 stores each record in the slot range of its chromosome, with the
 *      strand character converted by strand_c2i.
 *   3. Records of each chromosome are bubble-sorted by position.
 *   4. Each chain is read into size/dt/dq arrays, and every site between
 *      start1 and end1 on chr1 is projected through the block containing it.
 *
 * Uses names declared outside this function: MAXBUFFLENGTH, MAXCHR, MAXALN,
 * ARRAY_MARGIN, verbose, out_file, cps_file, chain_file, assign_code,
 * get_chr_code, strand_c2i, strand_i2c, swapi, swapc, progressbar,
 * timestamp_set, timestamp_report.
 *
 * Terminates the process with status 1 on any error and 0 on success.
 */
int main(int argc, char* argv[]) {
    /* File names start empty so that a missing key leads to a clean fopen failure. */
    char cps_file_name[MAXBUFFLENGTH]="";
    char chain_file_name[MAXBUFFLENGTH]="";
    char out_file_name[MAXBUFFLENGTH]="";

    int marginlength = 0;

    char buff[MAXBUFFLENGTH+1];
    char aux[MAXBUFFLENGTH+1];

    long score;
    int start1,end1,len1,start2,end2,len2;
    char strand1, strand2, chr1[MAXBUFFLENGTH], chr2[MAXBUFFLENGTH];

    int *size, *dq, *dt;
    int a,b,k,i,j,s,m,x;

    int *position;
    int *strand;
    int *idg;
    int *ids;
    char *type;
    char c;

    int chridx[MAXCHR+1];   /* first slot of every chromosome; entry MAXCHR holds the total */
    int chroff[MAXCHR+1];   /* number of slots already filled per chromosome */

    char resstr;
    int  rescrd;
    long chain_id;

    char *key;
    int takes_value;


    if(argc==1) {
        fprintf(stderr,"This utility does liftOver of coordinates (cps) by  using chain alignment\n");
        fprintf(stderr,"Gene information is included in the output\n");
        fprintf(stderr,"Last update by Dmitri Pervouchine ([email protected]) on Mar 22, 2013\n");
        fprintf(stderr,"Usage: %s -in <cps_file> -chain <chain_alignment_file> [-margin <length>] [-quiet]\n", argv[0]);
        fprintf(stderr," -in cps6, i.e. chr1/position1/strand1/gene/site/type tab-delimited file, strand is +/-\n");
        fprintf(stderr," -chain UCSC chain alignment file, species1=>2\n");
        fprintf(stderr," -out <output_file> [default=stdout]\n");
        fprintf(stderr," -margin margin length [default=0]\n -quiet suppress verbose output [default=NO]\n");
        fprintf(stderr,"NOTE: Input has to be sorted by position!\n");
        fprintf(stderr,"Output format cps3+cps6: chr1/position1/strand1/chr2/position2/strand2/gene/site/type/score\n");
        exit(1);
    }

    timestamp_set();
    for(i=1;i<argc;i++) {
        key = argv[i];
        takes_value = strcmp(key,"-in")==0 || strcmp(key,"-chain")==0 ||
                      strcmp(key,"-out")==0 || strcmp(key,"-margin")==0;
        if(takes_value) {
            /* Never read past the end of argv when the value is missing. */
            if(i+1>=argc) {
                fprintf(stderr,"[ERROR: option %s requires a value, exiting]\n", key);
                exit(1);
            }
            i++;
            /* Bounded copies: an over-long argument is truncated instead of overflowing. */
            if(strcmp(key,"-in")==0)         snprintf(cps_file_name,   sizeof(cps_file_name),   "%s", argv[i]);
            else if(strcmp(key,"-chain")==0) snprintf(chain_file_name, sizeof(chain_file_name), "%s", argv[i]);
            else if(strcmp(key,"-out")==0)   snprintf(out_file_name,   sizeof(out_file_name),   "%s", argv[i]);
            else                             sscanf(argv[i], "%i", &marginlength);
        }
        else if(strcmp(key,"-quiet")==0) {
            verbose=0;
        }
    }

    if(out_file_name[0]==0) {
        fprintf(stderr,"[WARNING: output file not specified, redirect to stdout]\n");
        out_file = stdout;
    }
    else {
        out_file = fopen(out_file_name,"w");
        if(out_file == NULL) {
            fprintf(stderr,"[ERROR: output file %s cannot be opened for writing, exiting]\n", out_file_name);
            exit(1);
        }
        if(verbose) fprintf(stderr,"[>%s]\n",out_file_name);
    }

    cps_file= fopen(cps_file_name,"r");
    if(cps_file==NULL) {
        fprintf(stderr,"[ERROR: cannot access %s, exiting]\n", cps_file_name);
        exit(1);
    }

    for(i=0;i<MAXCHR;i++) chridx[i] = chroff[i] = 0;

    /* Pass 1: count records per chromosome; a line shorter than 2 chars ends the input. */
    if(verbose) fprintf(stderr,"[<%s, pass 1",cps_file_name);
    while(fgets(buff,MAXBUFFLENGTH,cps_file)) {
        if(strlen(buff)<2) break;
        sscanf(buff,"%s" , aux);
        chridx[assign_code(aux)]++;
    }
    if(verbose) fprintf(stderr,"]\n");

    /* Convert per-chromosome counts into start offsets (exclusive prefix sum). */
    for(s=i=0;i<MAXCHR;i++) {
        x = chridx[i];
        chridx[i] =s;
        s+=x;
    }
    chridx[i] = s;

    position   = (int*) malloc(sizeof(int)*(s + ARRAY_MARGIN));
    strand     = (int*) malloc(sizeof(int)*(s + ARRAY_MARGIN));
    ids        = (int*) malloc(sizeof(int)*(s + ARRAY_MARGIN));
    idg        = (int*) malloc(sizeof(int)*(s + ARRAY_MARGIN));
    type       = (char*) malloc(sizeof(char)*(s + ARRAY_MARGIN));

    if(position==NULL || strand==NULL || type==NULL || ids==NULL || idg==NULL) {
        fprintf(stderr,"[ERROR: failed to create index tables, exiting]\n");
        exit(1);
    }

    /* Pass 2: store every record in the next free slot of its chromosome. */
    fseek (cps_file, 0, SEEK_SET);
    if(verbose) fprintf(stderr,"[<%s, pass 2", cps_file_name);
    while(fgets(buff,MAXBUFFLENGTH,cps_file)) {
        if(strlen(buff)<2) break;
        sscanf(buff,"%s" , aux);
        i = assign_code(aux);
        m = chridx[i]+chroff[i];
        sscanf(buff,"%*s %i %c %i %i %c" , &position[m], &c,&idg[m],&ids[m],&type[m]);
        strand[m] = strand_c2i(c);
        chroff[i]++;
    }
    fclose(cps_file);
    if(verbose) fprintf(stderr,"]\n");


    /* Bubble sort by position inside every chromosome, keeping parallel arrays in step. */
    if(verbose) fprintf(stderr,"[Sort by position (if not done before)");
    for(i=0;i<MAXCHR;i++) {
        k=1;
        while(k) {
            k=0;
            for(j=chridx[i];j<chridx[i+1]-1;j++) {
                if(position[j]>position[j+1]) {
                    k=1;
                    swapi(position+j,position+j+1);
                    swapi(strand+j,strand+j+1);
                    swapi(idg+j,idg+j+1);
                    swapi(ids+j,ids+j+1);
                    swapc(type+j,type+j+1);
                }
            }
        }
    }
    if(verbose) fprintf(stderr,"]\n");


    /* Per-chain block arrays: block size and the gaps that follow it (dt, dq). */
    size = (int*) malloc(sizeof(int)*(MAXALN + ARRAY_MARGIN));
    dq   = (int*) malloc(sizeof(int)*(MAXALN + ARRAY_MARGIN));
    dt   = (int*) malloc(sizeof(int)*(MAXALN + ARRAY_MARGIN));

    if(size ==0 || dq ==0 || dt==0) {
        fprintf(stderr,"[ERROR: not enough memory for chains, exiting]\n");
        exit(1);
    }

    chain_file = fopen(chain_file_name,"r");
    if(chain_file==NULL) {
        fprintf(stderr,"[ERROR: cannot access %s, exiting]\n", chain_file_name);
        exit(1);
    }

    /* File length, used only as the upper bound of the progress bar. */
    fseek(chain_file, 0, SEEK_END);
    unsigned int last_pos = ftell(chain_file);
    fseek(chain_file, 0, SEEK_SET);

    while(fgets(buff,MAXBUFFLENGTH,chain_file)) {
        if(strlen(buff)<2) break;
        buff[5]=0;                      /* isolate the first five characters of the line */
        if(strcmp(buff,"chain")==0) {
            sscanf(buff+6,"%li %s %i %c %i %i %s %i %c %i %i %li",&score, &chr1[0], &len1, &strand1, &start1, &end1, &chr2[0], &len2, &strand2, &start2, &end2, &chain_id);

            /* Read the block lines of this chain up to the terminating short line. */
            k=0;
            while(fgets(buff,MAXBUFFLENGTH,chain_file)) {
                if(strlen(buff)<2) break;
                progressbar(ftell(chain_file), last_pos-1, (char*)"Processing ", verbose);
                /* The final block line carries only a size; make the missing gaps zero
                   instead of leaving stale values from a previous chain. */
                size[k] = dt[k] = dq[k] = 0;
                sscanf(buff,"%i %i %i",size + k, dt + k, dq + k);
                k++;
                if(k>=MAXALN) {
                    fprintf(stderr,"[ERROR: chain too long, exiting]\n");
                    exit(1);
                }
            }

            x = get_chr_code(chr1);
            if(x<0) continue;           /* chromosome has no sites */

            a=start1;b=start2;          /* start of the current block in both coordinate systems */
            j=0;

            /* Skip sites located before the chain, then walk the sites inside it. */
            for(i=chridx[x];i<chridx[x+1] && position[i]<start1;i++);
            for(;i<chridx[x+1]&& position[i]<end1;i++) {
                /* Advance to the block (plus trailing gap) containing the site.
                   The bound test comes first so entries past the chain are never read. */
                while(j<k && position[i]>a+size[j]+dt[j]){
                    a+=size[j]+dt[j];
                    b+=size[j]+dq[j];
                    j++;
                }
                if(j>=k) break;
                /* Report only sites inside the aligned block, respecting the margin. */
                if(position[i]-a > marginlength && a+size[j]-position[i] >= marginlength) {
                    if(strand1==strand2) {
                        resstr = strand[i];
                        rescrd = position[i] - a + b;
                    }
                    else {
                        resstr = -strand[i];
                        rescrd = len2 - (position[i] - a + b - 1) ;
                    }
                    fprintf(out_file,"%s\t%i\t%c\t%s\t%i\t%c\t%i\t%i\t%c\t%li\n",chr1, position[i], strand_i2c(strand[i]), chr2, rescrd, strand_i2c(resstr), idg[i], ids[i],type[i],score);
                }
            }
        }
    }
    fclose(chain_file);
    fclose(out_file);
    timestamp_report();

    free(size);
    free(dq);
    free(dt);

    free(position);
    free(strand);
    free(ids);
    free(idg);
    free(type);
    exit(0);
}
示例#4
0
/*
 * Selects the best mapping for each gene from a non-unique site alignment.
 *
 * Command-line keys:
 *   -in <file>          CPS file (chr, position, strand, gene id, site id, type)
 *   -aln <file>         ALN file with candidate projections of the sites
 *   -out <file>         output file (stdout when omitted)
 *   -lendiff <int>      absolute length-difference limit (dlimit)
 *   -threshold <double> relative length-difference threshold (dthreshold)
 *   -maxdepth <int>     how many preceding sites of a gene may be linked to
 *   -s <int>            site id for which debug lines are printed to stderr
 *   -quiet              suppress verbose output
 *
 * Processing steps:
 *   1. Three passes over the CPS file: find the largest gene/site ids, count
 *      sites per gene, then store site id, type and position per gene.
 *   2. Two passes over the ALN file: count candidates per site, then store
 *      chromosome code, position and strand of every candidate.
 *   3. For every gene, every candidate k of site j is linked to the candidate
 *      of one of the max_depth preceding sites (same chromosome and strand)
 *      whose distance differs least from the source distance a; if the
 *      difference passes either threshold, the candidate scores
 *      score[previous] + a. The best-scoring candidate is traced back through
 *      site_lbest/site_qbest and each step is printed.
 *
 * Uses names declared outside this function: MAXBUFFLENGTH, INFTY, verbose,
 * outfile, cpsfile, alnfile, assign_code, get_chr_name, progressbar,
 * timestamp_set, timestamp_report.
 *
 * Terminates the process with status 1 on any error and 0 on success.
 */
int main(int argc, char* argv[]) {
    /* File names start empty so that a missing key leads to a clean fopen failure. */
    char alnfilename[MAXBUFFLENGTH]="";
    char cpsfilename[MAXBUFFLENGTH]="";
    char outfilename[MAXBUFFLENGTH]="";

    char c;

    char buff[MAXBUFFLENGTH];
    char aux[MAXBUFFLENGTH];

    int a,b,k,l,i,j,s,m,q;
    int d,dmin,lmin,score_max;
    int x,y;

    int kbest, jbest;
    int kprev;

    int *gene_idx;      /* index */
    int *gene_off;      /* offset */
    int *gene_site;     /* site number */
    char *gene_styp;    /* site type */
    int *gene_pos;      /* site position */
    int *gene_chr;      /* chromosome */
    char *gene_str;     /* strand */

    int max_genes;
    int max_sites;

    int *site_idx;      /* index */
    int *site_off;      /* offset */
    int *site_chr;      /* matching chromosome */
    int *site_pos;      /* matching pos */
    int *site_str;      /* matching strand */
    int *site_score;    /* optimal score */
    int *site_lbest;    /* candidate of the preceding site it came from */
    int *site_qbest;    /* how many sites back that candidate is */

    int specific_site=0;

    double dthreshold = 0.50;
    int dlimit = 5000;
    int max_depth = 4;

    int pos, strand;

    int skipped = 0;    /* ALN records whose site id is outside the CPS range */
    char *key;
    int takes_value;

    if(argc==1) {
        fprintf(stderr,"Select best unique maping from the ALN file created by map_single\n");
        fprintf(stderr,"Last updated by Dmitri Pervouchine ([email protected]) on Jan 28, 2013\n");
        fprintf(stderr,"Keys:\n -in <cps file>\n -aln <aln file>\n -out <output file>\n");
        exit(1);
    }

    timestamp_set();
    for(i=1;i<argc;i++) {
        key = argv[i];
        takes_value = strcmp(key,"-in")==0 || strcmp(key,"-aln")==0 || strcmp(key,"-out")==0 ||
                      strcmp(key,"-lendiff")==0 || strcmp(key,"-threshold")==0 ||
                      strcmp(key,"-maxdepth")==0 || strcmp(key,"-s")==0;
        if(takes_value) {
            /* Never read past the end of argv when the value is missing. */
            if(i+1>=argc) {
                fprintf(stderr,"[ERROR: option %s requires a value, exiting]\n", key);
                exit(1);
            }
            i++;
            /* Bounded copies: an over-long argument is truncated instead of overflowing. */
            if(strcmp(key,"-in")==0)             snprintf(cpsfilename, sizeof(cpsfilename), "%s", argv[i]);
            else if(strcmp(key,"-aln")==0)       snprintf(alnfilename, sizeof(alnfilename), "%s", argv[i]);
            else if(strcmp(key,"-out")==0)       snprintf(outfilename, sizeof(outfilename), "%s", argv[i]);
            else if(strcmp(key,"-lendiff")==0)   sscanf(argv[i], "%i", &dlimit);
            else if(strcmp(key,"-threshold")==0) sscanf(argv[i], "%lf", &dthreshold);
            else if(strcmp(key,"-maxdepth")==0)  sscanf(argv[i], "%i", &max_depth);
            else                                 sscanf(argv[i], "%i", &specific_site);
        }
        else if(strcmp(key,"-quiet")==0) {
            verbose=0;
        }
    }

    if(outfilename[0]==0) {
        fprintf(stderr,"[WARNING: output file not specified, redirect to stdout]\n");
        outfile = stdout;
    }
    else {
        outfile = fopen(outfilename,"w");
        if(outfile == NULL) {
            fprintf(stderr,"[ERROR: output file (%s) cannot be opened, exiting]\n", outfilename);
            exit(1);
        }
    }

/*******************************************************************************************************/
    cpsfile= fopen(cpsfilename,"r");
    if(cpsfile==NULL) {
        fprintf(stderr,"Can't access CPS file. Exiting\n");
        exit(1);
    }

    /* Pass 0: find the largest gene id and site id to size the index tables. */
    if(verbose) fprintf(stderr,"[Reading CPS, pass 0");
    max_sites = max_genes = 0;
    i = j = 0;
    while(fgets(buff,MAXBUFFLENGTH,cpsfile)) {
        if(strlen(buff)<2) break;
        sscanf(buff,"%*s %*i %*i %i %i", &i, &j);
        /* Ids are used as array indices below, so negative values cannot be accepted. */
        if(i<0 || j<0) {
            fprintf(stderr,"[ERROR: negative gene or site id in CPS file, exiting]\n");
            exit(1);
        }
        if(i>max_genes) max_genes = i;
        if(j>max_sites) max_sites = j;
    }

    max_genes++;
    max_sites++;

    gene_idx = (int*)  malloc(sizeof(int)*(max_genes+1));
    gene_off = (int*)  malloc(sizeof(int)*(max_genes+1));
    gene_chr = (int*)  malloc(sizeof(int)*(max_genes+1));
    gene_str = (char*) malloc(sizeof(char)*(max_genes+1));

    if(gene_idx==NULL || gene_off==NULL || gene_chr==NULL || gene_str==NULL) {
        fprintf(stderr,"[ERROR: not enough memory for gene tables, exiting]\n");
        exit(1);
    }

    for(i=0;i<max_genes;i++) gene_idx[i]=gene_off[i]=0;

    if(verbose) fprintf(stderr,", max_genes = %i, max_sites = %i]\n", max_genes, max_sites);

    /* Pass 1: count sites per gene and remember chromosome and strand of the gene. */
    if(verbose) fprintf(stderr,"[Reading CPS, pass 1");
    fseek (cpsfile, 0, SEEK_SET);
    while(fgets(buff,MAXBUFFLENGTH,cpsfile)) {
        if(strlen(buff)<2) break;
        sscanf(buff,"%s %*i %i %i %*i" , aux, &strand, &i);
        gene_idx[i]++;
        gene_chr[i] = assign_code(aux);
        gene_str[i] = strand;
    }

    /* Convert per-gene counts into start offsets (exclusive prefix sum). */
    for(s=i=0;i<max_genes;i++) {
        x = gene_idx[i];
        gene_idx[i] =s;
        s+=x;
    }
    gene_idx[i] = s;

    gene_site = (int*)  malloc(sizeof(int)*(s+1));
    gene_styp = (char*) malloc(sizeof(char)*(s+1));
    gene_pos  = (int*)  malloc(sizeof(int)*(s+1));

    if(gene_site==NULL || gene_styp==NULL || gene_pos==NULL) {
        fprintf(stderr,"[ERROR: not enough memory for gene tables, exiting]\n");
        exit(1);
    }

    if(verbose) fprintf(stderr,", records = %i]\n", s);

    /* Pass 2: store site id, type and position in the slot range of the gene. */
    if(verbose) fprintf(stderr,"[Reading CPS, pass 2");
    fseek (cpsfile, 0, SEEK_SET);
    while(fgets(buff,MAXBUFFLENGTH,cpsfile)) {
        if(strlen(buff)<2) break;
        sscanf(buff,"%*s %i %*i %i %i %c" , &x, &i, &j, &c);
        gene_site[gene_idx[i]+gene_off[i]]=j;
        gene_styp[gene_idx[i]+gene_off[i]]=c;
        gene_pos[gene_idx[i]+gene_off[i]]=x;
        gene_off[i]++;
    }
    fclose(cpsfile);
    if(verbose) fprintf(stderr,"]\n");
/**********************************************************************************************/

    alnfile = fopen(alnfilename,"r");
    if(alnfile == NULL) {
        fprintf(stderr, "Cant open alignment file, exiting\n");
        exit(1);
    }

    site_idx = (int*) malloc(sizeof(int)*(max_sites+1));
    site_off = (int*) malloc(sizeof(int)*(max_sites+1));

    if(site_idx==NULL || site_off==NULL) {
        fprintf(stderr,"[ERROR: not enough memory for site tables, exiting]\n");
        exit(1);
    }

    for(i=0;i<max_sites;i++) {
        site_idx[i]=site_off[i]=0;
    }

    /* Pass 1: count candidate projections per site id. */
    if(verbose) fprintf(stderr,"[Reading alignment file, pass 1");

    while(fgets(buff,MAXBUFFLENGTH,alnfile)) {
        if(strlen(buff)<2) break;
        sscanf(buff,"%*s %*i %*i %*s %*i %*i %*i %i %*c" , &i);
        /* The tables are sized from the CPS file; ignore site ids outside that range. */
        if(i<0 || i>=max_sites) {
            skipped++;
            continue;
        }
        site_idx[i]++;
    }

    /* Convert per-site counts into start offsets (exclusive prefix sum). */
    for(s=i=0;i<max_sites;i++) {
        x = site_idx[i];
        site_idx[i] =s;
        s+=x;
    }
    site_idx[i] = s;

    site_chr   = (int*) malloc(sizeof(int)*(s+1));
    site_pos   = (int*) malloc(sizeof(int)*(s+1));
    site_str   = (int*) malloc(sizeof(int)*(s+1));
    site_score = (int*) malloc(sizeof(int)*(s+1));
    site_lbest = (int*) malloc(sizeof(int)*(s+1));
    site_qbest = (int*) malloc(sizeof(int)*(s+1));

    if(site_chr==NULL || site_pos==NULL || site_str==NULL || site_score==NULL || site_lbest==NULL || site_qbest==NULL) {
        fprintf(stderr,"[ERROR: not enough memory for site tables, exiting]\n");
        exit(1);
    }

    if(verbose) fprintf(stderr,", records = %i]\n",s);

    /* Pass 2: store the candidates in the slot range of their site. */
    if(verbose) fprintf(stderr,"[Reading alignment file, pass 2");
    fseek (alnfile, 0, SEEK_SET);
    while(fgets(buff,MAXBUFFLENGTH,alnfile)) {
        if(strlen(buff)<2) break;
        sscanf(buff,"%*s %*i %*i %s %i %i %*i %i %*c" , aux, &pos, &strand, &i);
        /* Same filter as in pass 1, so counts and stored records agree. */
        if(i<0 || i>=max_sites) continue;
        site_chr[site_idx[i]+site_off[i]] = assign_code(aux);
        site_pos[site_idx[i]+site_off[i]] = pos;
        site_str[site_idx[i]+site_off[i]] = strand;
        site_score[site_idx[i]+site_off[i]] = 0;
        site_lbest[site_idx[i]+site_off[i]] = -1;
        site_qbest[site_idx[i]+site_off[i]] = -1;
        site_off[i]++;
    }
    fclose(alnfile);
    if(verbose) fprintf(stderr,"]\n");
    if(verbose && skipped>0) fprintf(stderr,"[WARNING: %i alignment records had site ids out of range, they were ignored]\n", skipped);

    for(i=0;i<max_genes;i++) {
        progressbar(i,max_genes-1, (char*)"Processing");
        score_max=0;
        kbest = -1;
        jbest = -1;     /* set together with kbest; initialised so it is never read indeterminate */
        for(j=gene_idx[i];j<gene_idx[i+1];j++) {
            x = gene_site[j];
            /* Reset the candidates of the current site before scoring them. */
            for(k=site_idx[x];k<site_idx[x+1];k++) {
                site_score[k] = 0;
                site_lbest[k] = site_qbest[k] = -1;
            }
            /* Try to link to each of the max_depth preceding sites of the same gene. */
            for(q=1;q<=max_depth;q++) {
                if(j-q>=gene_idx[i]) {
                    y = gene_site[j-q];
                    a = abs(gene_pos[j]-gene_pos[j-q]);     /* distance in the source genome */
                    for(k=site_idx[x];k<site_idx[x+1];k++) {
                        /* Find the candidate of site y whose distance to k is closest to a. */
                        dmin = INFTY;
                        lmin = -1;
                        for(l=site_idx[y];l<site_idx[y+1];l++) {
                            if(site_chr[k] == site_chr[l] && site_str[k] == site_str[l]) {
                                b = abs(site_pos[k]-site_pos[l]);
                                d = abs(b-a);
                                if(d<dmin) {
                                    dmin = d;
                                    lmin = l;
                                }
                                if(x==specific_site) fprintf(stderr,"[prev=%i curr=%i pos_p=%i pos_c=%i d=%i]\n",y,x,site_pos[l],site_pos[k],d);
                            }
                        }
                        /* Accept the link when either the relative or the absolute criterion holds. */
                        m  = (lmin>=0 && (((double)dmin/a)<dthreshold || dmin<dlimit)) ? site_score[lmin] + a : 0;
                        if(m>site_score[k]) {
                            site_score[k] = m;
                            site_lbest[k] = lmin;
                            site_qbest[k] = q;
                        }
                        if(site_score[k]>score_max) {
                            score_max = site_score[k];
                            kbest = k; jbest = j;
                        }
                        if(x==specific_site) fprintf(stderr,"[curr=%i score=%i]\n",x,site_score[k]);
                    }
                }
            }
        }
        j = jbest;
        k = kbest;
        if(k>=0 && site_score[k]>0) {
            fprintf(outfile,"%s\t%i\t%i\t%s\t%i\t%i\t",get_chr_name(gene_chr[i]),gene_pos[j],gene_str[i],get_chr_name(site_chr[k]),site_pos[k],site_str[k]);
            fprintf(outfile,"%i\t%i\t%c\t%i\t%i\n",i,gene_site[j],gene_styp[j],site_pos[site_lbest[k]],0);
            /* Trace the chain of links back from the best candidate. */
            while(site_score[k]>0 && site_lbest[k]>=0 && site_qbest[k]>=0) {
                kprev = k;
                j = j - site_qbest[k];
                k = site_lbest[k];
                fprintf(outfile,"%s\t%i\t%i\t%s\t%i\t%i\t",get_chr_name(gene_chr[i]),gene_pos[j],gene_str[i],get_chr_name(site_chr[k]),site_pos[k],site_str[k]);
                fprintf(outfile,"%i\t%i\t%c\t%i\t%i\n",i,gene_site[j],gene_styp[j],(site_lbest[k]>=0?site_pos[site_lbest[k]]:0),(kprev>=0?site_pos[kprev]:0));
            }

        }
    }
    fclose(outfile);
    timestamp_report();
    exit(0);
}
示例#5
0
/*
 * Synteny filter for a non-unique coordinate mapping.
 *
 * Input (-in) is a text file whose lines hold six whitespace-separated
 * fields: query chromosome, query position, query strand, target chromosome,
 * target position, target strand.  Consecutive lines sharing the same query
 * triple are grouped into one "query site" with several candidate targets.
 *
 * The file is read three times:
 *   pass 1 counts lines (upper bound on the number of query sites),
 *   pass 2 records the distinct query sites and how many targets each has,
 *   pass 3 stores the targets of every query site.
 *
 * A dynamic-programming sweep then scores every candidate target by chaining
 * it to the closest compatible candidate of up to max_depth preceding query
 * sites, and a traceback prints the selected chains to the output (-out, or
 * stdout when not given).
 *
 * Query sites are stored 1-based (index 0 is never used for data).
 *
 * Returns via exit(): 0 on success, 1 on usage/IO/memory/format errors.
 */
int main(int argc, char* argv[]) {
    char aln_file_name[MAXBUFFLENGTH]="";
    char out_file_name[MAXBUFFLENGTH]="";

    char buff[MAXBUFFLENGTH];
    char chr1[MAXBUFFLENGTH];
    char chr2[MAXBUFFLENGTH];

    double dthreshold = 1.50;
    int dlimit = 100000;
    int max_depth = 4;

    int** chr_t;
    int** str_t;
    int** pos_t;

    int* chr_q;
    int* str_q;
    int* pos_q;

    int **score;
    int **jbest;
    int **lbest;

    int a,b,d,dmin,lmin,s;
    char c,c1,c2;

    int *count;
    int *ptr;

    int max_rec=0;
    size_t cap;

    int pos1, pos2, str1, str2, code1;
    int i, j, k, l, n;
    int len;

    int s_max, k_max;

    const char *opt;
    const char *val;

    if(argc==1) {
        fprintf(stderr,"This utility takes a non-unique mapping in cps3+cps3 format and does ad-hoc filtering of the projected coordinates by maximum synteny\n");
        fprintf(stderr,"Last update by Dmitri Pervouchine ([email protected]) on Mar 26, 2013\n");
        fprintf(stderr,"Usage: %s -in <aln_file> -out <output_file> [-maxdepth <int>] [-threshold <double>] [-lendiff <diff>] [-quiet]\n",argv[0]);
        fprintf(stderr," -in cps3+cps3 file, remember to sort by position in ascending order\n");
        fprintf(stderr," -out <output_file> [default=stdout]\n");
        fprintf(stderr," -maxdepth <integer> how many preceding positions can be skipped [default=%i]\n", max_depth);
        fprintf(stderr," -threshold <double> max change of segment length, in percent [default=%2.2lf]\n", dthreshold);
        fprintf(stderr," -lendiff <integer>, [default=%i]\n",dlimit);
        fprintf(stderr," -quiet suppress verbose output [default=NO]\n");
        fprintf(stderr,"Note: the mapping [x,x+dx] -> [y,y+dy] is OK if |dy-dx|/dx<threshold OR |dy-dx|<dlimit\n");
        exit(1);
    }

    timestamp_set();

    /* Command line: unknown arguments are ignored; every option except
       -quiet requires a value, which is validated before use. */
    for(i=1;i<argc;i++) {
        opt = argv[i];
        if(strcmp(opt,"-quiet")==0) {
            verbose=0;
            continue;
        }
        if(strcmp(opt,"-in")!=0 && strcmp(opt,"-out")!=0 && strcmp(opt,"-lendiff")!=0 &&
           strcmp(opt,"-threshold")!=0 && strcmp(opt,"-maxdepth")!=0) {
            continue;
        }
        if(i+1>=argc) {
            fprintf(stderr,"[ERROR: option %s requires a value, exiting]\n", opt);
            exit(1);
        }
        val = argv[++i];
        if(strcmp(opt,"-in")==0) {
            len = snprintf(aln_file_name, sizeof(aln_file_name), "%s", val);
            if(len<0 || (size_t)len>=sizeof(aln_file_name)) {
                fprintf(stderr,"[ERROR: input file name is too long, exiting]\n");
                exit(1);
            }
        }
        else if(strcmp(opt,"-out")==0) {
            len = snprintf(out_file_name, sizeof(out_file_name), "%s", val);
            if(len<0 || (size_t)len>=sizeof(out_file_name)) {
                fprintf(stderr,"[ERROR: output file name is too long, exiting]\n");
                exit(1);
            }
        }
        else if(strcmp(opt,"-lendiff")==0) {
            if(sscanf(val, "%i", &dlimit)!=1) {
                fprintf(stderr,"[ERROR: invalid value for %s (%s), exiting]\n", opt, val);
                exit(1);
            }
        }
        else if(strcmp(opt,"-threshold")==0) {
            if(sscanf(val, "%lf", &dthreshold)!=1) {
                fprintf(stderr,"[ERROR: invalid value for %s (%s), exiting]\n", opt, val);
                exit(1);
            }
        }
        else {  /* -maxdepth */
            if(sscanf(val, "%i", &max_depth)!=1) {
                fprintf(stderr,"[ERROR: invalid value for %s (%s), exiting]\n", opt, val);
                exit(1);
            }
        }
    }

    if(aln_file_name[0]==0) {
        fprintf(stderr,"[ERROR: input file not specified, exiting]\n");
        exit(1);
    }

    if(out_file_name[0]==0) {
        fprintf(stderr,"[WARNING: output file not specified, redirect to stdout]\n");
        out_file = stdout;
    }
    else {
        out_file = fopen(out_file_name,"w");
        if(out_file == NULL) {
            fprintf(stderr,"[ERROR: output file (%s) cannot be opened, exiting]\n", out_file_name);
            exit(1);
        }
        if(verbose) fprintf(stderr,"[>%s]\n",out_file_name);
    }

/*******************************************************************************************************/
    aln_file= fopen(aln_file_name,"r");
    if(aln_file==NULL) {
        fprintf(stderr,"[ERROR: cannot access %s, exiting]\n", aln_file_name);
        exit(1);
    }

    /* Pass 1: count lines; reading stops at the first (near-)empty line. */
    if(verbose) fprintf(stderr,"[<%s, pass 1", aln_file_name);
    while(fgets(buff, MAXBUFFLENGTH, aln_file)) {
        if(strlen(buff)<2) break;
        max_rec++;
    }
    if(verbose) fprintf(stderr,"]\n");

    /* Query sites are 1-based and there can be as many as max_rec of them,
       so one extra slot is reserved on top of the usual margin. */
    cap = (size_t)max_rec + ARRAY_MARGIN + 1;

    chr_q = (int*)malloc(sizeof(int)*cap);
    str_q = (int*)malloc(sizeof(int)*cap);
    pos_q = (int*)malloc(sizeof(int)*cap);

    chr_t = (int**)malloc(sizeof(int*)*cap);
    str_t = (int**)malloc(sizeof(int*)*cap);
    pos_t = (int**)malloc(sizeof(int*)*cap);

    score = (int**)malloc(sizeof(int*)*cap);
    lbest = (int**)malloc(sizeof(int*)*cap);
    jbest = (int**)malloc(sizeof(int*)*cap);

    /* calloc: every counter, including the one at index max_rec, starts at 0. */
    count = (int*)calloc(cap, sizeof(int));
    ptr   = (int*)calloc(cap, sizeof(int));

    if(chr_q == NULL || str_q == NULL || pos_q == NULL || count == NULL || ptr == NULL ||
       chr_t == NULL || str_t == NULL || pos_t == NULL ||
       score == NULL || lbest == NULL || jbest == NULL) {
        fprintf(stderr,"[ERROR: not enough memory, exiting]\n");
        exit(1);
    }

    /* Pass 2: collect distinct consecutive query sites and count their targets. */
    if(verbose) fprintf(stderr,"[<%s, pass 2", aln_file_name);
    fseek (aln_file, 0, SEEK_SET);
    n=0;
    while(fgets(buff, MAXBUFFLENGTH, aln_file)) {
        if(strlen(buff)<2) break;
        if(sscanf(buff,"%s %i %c" , &chr1[0], &pos1, &c)!=3) {
            fprintf(stderr,"[ERROR: malformed line in %s, exiting]\n", aln_file_name);
            exit(1);
        }
        str1 = strand_c2i(c);
        code1 = assign_code(chr1);
        /* n==0: no site stored yet, so there is nothing valid to compare with. */
        if(n==0 || code1 != chr_q[n] || str1 != str_q[n] || pos1 != pos_q[n]) {
            n++;
            chr_q[n] = code1;
            str_q[n] = str1;
            pos_q[n] = pos1;
        }
        count[n]++;
    }
    if(verbose) fprintf(stderr,"]\n");

    for(i=1; i<=n; i++) {
        chr_t[i] = (int*)malloc(sizeof(int)*(count[i] + ARRAY_MARGIN));
        pos_t[i] = (int*)malloc(sizeof(int)*(count[i] + ARRAY_MARGIN));
        str_t[i] = (int*)malloc(sizeof(int)*(count[i] + ARRAY_MARGIN));

        score[i] = (int*)malloc(sizeof(int)*(count[i] + ARRAY_MARGIN));
        lbest[i] = (int*)malloc(sizeof(int)*(count[i] + ARRAY_MARGIN));
        jbest[i] = (int*)malloc(sizeof(int)*(count[i] + ARRAY_MARGIN));

        if(chr_t[i] == NULL || str_t[i] == NULL || pos_t[i] == NULL ||
           score[i] == NULL || lbest[i] == NULL || jbest[i] == NULL) {
            fprintf(stderr,"[ERROR: not enough memory, exiting]\n");
            exit(1);
        }
    }

    /* Pass 3: store the targets of each query site, using the same grouping
       rule as pass 2 so that site indices agree. */
    if(verbose) fprintf(stderr,"[<%s, pass 3", aln_file_name);
    fseek (aln_file, 0, SEEK_SET);
    n=0;
    while(fgets(buff, MAXBUFFLENGTH, aln_file)) {
        if(strlen(buff)<2) break;
        if(sscanf(buff,"%s %i %c %s %i %c" , &chr1[0], &pos1, &c1, &chr2[0], &pos2, &c2)!=6) {
            fprintf(stderr,"[ERROR: malformed line in %s, exiting]\n", aln_file_name);
            exit(1);
        }
        str1 = strand_c2i(c1);
        str2 = strand_c2i(c2);
        code1 = assign_code(chr1);
        if(n==0 || code1 != chr_q[n] || str1 != str_q[n] || pos1 != pos_q[n]) n++;
        chr_t[n][ptr[n]] = assign_code(chr2);
        pos_t[n][ptr[n]] = pos2;
        str_t[n][ptr[n]] = str2;
        ptr[n]++;
    }
    if(verbose) fprintf(stderr,"]\n");
    fclose(aln_file);

    /* Scoring: for target k of site i, look back at up to max_depth preceding
       sites j; within each, pick the target l on the same target chromosome
       and with consistent relative strand whose target distance b is closest
       to the query distance a.  The link is accepted when the discrepancy
       dmin is small relatively (dmin/a < dthreshold) or absolutely
       (dmin < dlimit); an accepted link extends score[j][l] by a. */
    for(i=1; i<=n; i++) {
        progressbar(i, n, (char*)"Filtering ", verbose);
        for(k=0; k<count[i]; k++) {
            score[i][k] = 0;
            lbest[i][k] = -1;
            jbest[i][k] = -1;
            for(j=i-1; j>0 && i-j<=max_depth; j--) {
                a = abs(pos_q[i] - pos_q[j]);
                dmin = INFTY;
                lmin = -1;
                for(l=0; l<count[j]; l++) {
                    if(chr_t[i][k] == chr_t[j][l] && str_t[i][k]*str_t[j][l] == str_q[i]*str_q[j]) {
                        b = abs(pos_t[i][k] - pos_t[j][l]);
                        d = abs(b-a);
                        if(d<dmin) {
                            dmin = d;
                            lmin = l;
                        }
                    }
                }
                /* a>0 guard: avoids dividing by zero; with a==0 only the
                   absolute criterion can accept the link. */
                s = (lmin>=0 && ((a>0 && ((double)dmin/a) < dthreshold) || dmin < dlimit)) ? score[j][lmin] + a : 0;
                if(s > score[i][k]) {
                    score[i][k] = s;
                    jbest[i][k] = j;
                    lbest[i][k] = lmin;
                }
            }
        }
    }

    /* Traceback from the last site downwards: take the best-scoring target
       of site i and follow the jbest/lbest links, printing each visited
       (query, target) pair.  The loop variable i is moved along the chain on
       purpose, so the outer loop resumes below the last site reached.
       NOTE(review): the first link of a chain (no predecessor) is never
       printed by this loop -- confirm that this is intended. */
    for(i=n;i>0;i--) {
        s_max = 0;
        k_max = -1;
        for(k=0; k<count[i]; k++) {
            if(score[i][k]>s_max) {
                s_max = score[i][k];
                k_max = k;
            }
        }
        if(k_max>=0) {
            k = k_max;
            while(jbest[i][k]>=0 && lbest[i][k]>=0) {
                fprintf(out_file,"%s\t%i\t%c\t", get_chr_name(chr_q[i]), pos_q[i], strand_i2c(str_q[i]));
                fprintf(out_file,"%s\t%i\t%c\n", get_chr_name(chr_t[i][k]), pos_t[i][k], strand_i2c(str_t[i][k]));
                j = jbest[i][k];
                l = lbest[i][k];
                i = j;
                k = l;
            }
        }

    }
    timestamp_report();
    exit(0);
}