int main(int argc, char* argv[]) { char cpsfilename[MAXBUFFLENGTH]; char chainfilename[MAXBUFFLENGTH]; char outfilename[MAXBUFFLENGTH]=""; int marginlength = 0; int MAXREC; char c; char *pc; int x; char buff[MAXBUFFLENGTH+1]; char aux[MAXBUFFLENGTH+1]; long score; int start1,end1,len1,start2,end2,len2; char strand1, strand2, chr1[MAXBUFFLENGTH], chr2[MAXBUFFLENGTH]; int *size, *dq, *dt; int a,b,k,i,j,s,m; int *position; char *strand; int *ids; int *idg; char *type; int chridx[MAXCHR+1]; int chroff[MAXCHR+1]; char resstr; int rescrd; if(argc==1) { fprintf(stderr,"Finds matches of the given set of sites (CPS file) in the BLASTZ chain alignment (CHAIN file)\n"); fprintf(stderr,"Last update by (dp) on Sep 21, 2011\n"); fprintf(stderr,"Keys:\n -i CPS file (remember to sort by position in ascending order)\n -d CHAIN alignment file\n -o output file\n"); fprintf(stderr," -m margin length [0]\n -v suppress verbose output [NO]\n"); exit(1); } timestamp_set(); for(i=1;i<argc;i++) { pc = argv[i]; if(*pc != '-') continue; if(*(pc+1) == 'i') { sscanf(argv[++i], "%s", &cpsfilename[0]); } if(*(pc+1) == 'd') { sscanf(argv[++i], "%s", &chainfilename[0]); } if(*(pc+1) == 'o') { sscanf(argv[++i], "%s", &outfilename[0]); } if(*(pc+1) == 'm') { sscanf(argv[++i], "%i", &marginlength); } if(*(pc+1) == 'v') { verbose=0; } } if(outfilename[0]==0) { fprintf(stderr,"No output file privided, exiting\n"); exit(1); } outfile = fopen(outfilename,"w"); if(outfile == NULL) { fprintf(stderr,"Can't open output file, exiting\n"); exit(1); } for(i=0;i<MAXCHR;i++) { chridx[i]=chroff[i]=0; } MAXREC = 0; cpsfile= fopen(cpsfilename,"r"); if(cpsfile==NULL) { fprintf(stderr,"Can't access CPS file. Exiting\n"); exit(1); } if(verbose) fprintf(stderr,"Reading CPS input pass 1"); while(!feof(cpsfile)) { buff[0]=0; fgets(buff,MAXBUFFLENGTH,cpsfile); if(strlen(buff)<2) break; sscanf(buff,"%s" , aux); chridx[assign_code(aux)]++; MAXREC++; } fclose(cpsfile); for(s=i=0;i<MAXCHR;i++) { x = chridx[i]; chridx[i] =s; s+=x; } chridx[i] = s; position = (int*) malloc(sizeof(int)*(s+4)); strand = (char*) malloc(sizeof(char)*(s+4)); type = (char*) malloc(sizeof(char)*(s+4)); ids = (int*) malloc(sizeof(int)*(s+4)); idg = (int*) malloc(sizeof(int)*(s+4)); if(position==NULL || strand==NULL || type==NULL || ids==NULL || idg==NULL) { fprintf(stderr,"Not enough memory. Terminated\n"); exit(1); } cpsfile= fopen(cpsfilename,"r"); if(verbose) fprintf(stderr,", records = %i\nReading CPS input pass 2",MAXREC); while(!feof(cpsfile)) { buff[0]=0; fgets(buff,MAXBUFFLENGTH,cpsfile); if(strlen(buff)<2) break; sscanf(buff,"%s" , aux); i = assign_code(aux); m = chridx[i]+chroff[i]; sscanf(buff,"%*s %i %c %i %i %c" , position+m, strand+m, idg+m, ids+m, type+m); chroff[i]++; } fclose(cpsfile); if(verbose) fprintf(stderr,"\nSorting segments"); /* for(i=0;i<MAXCHR;i++) { quickSort_ic(position,strand,chridx[i],chridx[i+1]-1); } */ for(i=0;i<MAXCHR;i++) { k=1; while(k) { k=0; for(j=chridx[i];j<chridx[i+1]-1;j++) { if(position[j]>position[j+1]) { k=1; swapi(position+j,position+j+1); swapc(strand+j,strand+j+1); swapc(type+j,type+j+1); swapi(ids+j,ids+j+1); swapi(idg+j,idg+j+1); } } } } if(verbose) fprintf(stderr," done\nProcessing chains"); /**********************************************************************************************/ size = (int*) malloc(sizeof(int)*MAXALN); dq = (int*) malloc(sizeof(int)*MAXALN); dt = (int*) malloc(sizeof(int)*MAXALN); if(size ==0 || dq ==0 || dt==0) { fprintf(stderr,"Not enough memory for such long chains. Terminated\n"); exit(1); } /**********************************************************************************************/ chainfile = fopen(chainfilename,"r"); while(!feof(chainfile)) { buff[0]=0; fgets(buff,MAXBUFFLENGTH,chainfile); if(strlen(buff)<2) break; buff[5]=0; if(strcmp(buff,"chain")==0) { sscanf(buff+6,"%li %s %i %c %i %i %s %i %c %i %i",&score, &chr1[0], &len1, &strand1, &start1, &end1, &chr2[0], &len2, &strand2, &start2, &end2); k=0; while(!feof(chainfile)) { buff[0]=0; fgets(buff,MAXBUFFLENGTH,chainfile); if(strlen(buff)<2) break; sscanf(buff,"%i %i %i",&size[k],&dt[k],&dq[k]); k++; if(k>MAXALN) { fprintf(stderr,"Chain length exceeded. Terminating"); exit(1); } } x = get_chr_code(chr1); if(x<0) continue; a=start1;b=start2; j=0; for(i=chridx[x];i<chridx[x+1] && position[i]<start1;i++); for(;i<chridx[x+1]&& position[i]<end1;i++) { while(position[i]>a+size[j]+dt[j] && j<k){ a+=size[j]+dt[j]; b+=size[j]+dq[j]; j++; } if(j>=k) break; if(position[i]-a > marginlength && a+size[j]-position[i] >= marginlength) { if(strand1==strand2) { resstr = strand[i]; rescrd = position[i] - a + b; } else { resstr = (strand[i]=='+') ? '-' : '+'; rescrd = len2 - (position[i] - a + b - 1) ; } fprintf(outfile,"%s\t%i\t%c\t%s\t%i\t%c\t%i\t%i\t%c\t%li\n",chr1, position[i], strand[i],chr2,rescrd, resstr, idg[i], ids[i],type[i],score); } } } } if(verbose) fprintf(stderr," done\n"); fclose(chainfile); fclose(outfile); timestamp_report(); exit(0); }
int main(int argc, char* argv[]) { char out_file_name[MAXBUFFLENGTH]=""; char input_file_name[MAXBUFFLENGTH]=""; char idx_file_name[MAXBUFFLENGTH]; char dbx_file_name[MAXBUFFLENGTH]; long offset; long seqlen; long intronic_window = 150; long exonic_window = 0; char filename[MAXBUFFLENGTH]; char name[MAXBUFFLENGTH]; char chr_name[MAXBUFFLENGTH]; char buff[MAXBUFFLENGTH]; char longbuff[MAXLONGBUFFLENGTH]; char longbuffm[MAXLONGBUFFLENGTH]; FILE *idx_file; FILE *dbx_file; FILE *input_file; FILE *outfile; int b,i,j,q,k,n,a,m,s; char c; long position,y; int strand; long** pos; int** str; char** typ; char*** ids; long p,l; int record_count[MAXCHR]; int record_idx[MAXCHR]; int cis = 0; int coord = 0; int all=0; int warnings = 0; char format[][64] = {"%*s %*i %*i %s %li %i %*s %s %c", "%s %li %i %*s %s %c"}; if(argc==1) { fprintf(stderr,"This routine get sequence segments from a custom compressed FASTA repository (see transf)\n"); fprintf(stderr,"Last update by Dmitri Pervouchine ([email protected]) on Mar 22, 2013\n"); fprintf(stderr," -in <aln_file>\n -dbx <database_file>\n -idx <index_file>\n -out <output_file>\n"); fprintf(stderr," -we <exonic_window> [default=%i]\n -wi <intronic_window> [default=%i]\n -cis [use colunms 1-3] [default=%i]\n", exonic_window, intronic_window, cis); fprintf(stderr," -quiet <suppress verbose output> [default=no]\n -all <include all sites>\n -coord <offset for 3'-sites> [default=%i]\n",coord); exit(1); } timestamp_set(); for(i=1;i<argc;i++) { if(strcmp(argv[i],"-in")==0) { sscanf(argv[++i], "%s", &input_file_name[0]); } if(strcmp(argv[i],"-dbx")==0) { sscanf(argv[++i], "%s", &dbx_file_name[0]); } if(strcmp(argv[i],"-idx")==0) { sscanf(argv[++i], "%s", &idx_file_name[0]); } if(strcmp(argv[i],"-out")==0) { sscanf(argv[++i], "%s", &out_file_name[0]); } if(strcmp(argv[i],"-we")==0) { sscanf(argv[++i], "%li", &exonic_window); } if(strcmp(argv[i],"-wi")==0) { sscanf(argv[++i], "%li", &intronic_window); } if(strcmp(argv[i],"-coord")==0) { sscanf(argv[++i], "%i", &coord); } if(strcmp(argv[i],"-quiet")==0) { verbose = 0; } if(strcmp(argv[i],"-cis")==0) { cis = 1; } if(strcmp(argv[i],"-all")==0) { all = 1; } } if(out_file_name[0]==0) { fprintf(stderr,"[WARNING: output file not specified, redirect to stdout]\n"); outfile = stdout; } else { outfile = fopen(out_file_name,"w"); if(outfile == NULL) { fprintf(stderr,"[ERROR: output file %s cannot be opened for writing, exiting]\n", out_file_name); exit(1); } if(verbose) fprintf(stderr,"[>%s]\n",out_file_name); } input_file = fopen(input_file_name,"r"); if(input_file == NULL) { fprintf(stderr,"[ERROR: cannot access %s, exiting]\n", input_file_name); exit(1); } if(verbose) fprintf(stderr,"[<%s, pass 1",input_file_name); while(fgets(buff,MAXBUFFLENGTH,input_file)) { if(strlen(buff)<2) break; sscanf(buff, format[cis], &chr_name[0], &position, &strand, &name[0], &c); n = assign_code(chr_name); record_count[n]++; } if(verbose) fprintf(stderr,"]\n"); pos = (long**) malloc(sizeof(long*)*(N_CHR_NAMES+1)); str = (int**) malloc(sizeof(int*)*(N_CHR_NAMES+1)); typ = (char**) malloc(sizeof(char*)*(N_CHR_NAMES+1)); ids = (char***)malloc(sizeof(char**)*(N_CHR_NAMES+1)); if(pos==NULL || str==NULL || typ==NULL || ids==NULL) { fprintf(stderr,"[ERROR: failed to create index tables, exiting]\n"); exit(1); } for(i=0;i<N_CHR_NAMES;i++) { if(record_count[i]>0) { pos[i] = (long*) malloc(sizeof(long)*(record_count[i]+1)); str[i] = (int*) malloc(sizeof(int)*(record_count[i]+1)); typ[i] = (char*) malloc(sizeof(char)*(record_count[i]+1)); ids[i] = (char**) malloc(sizeof(char*)*(record_count[i]+1)); if(pos[i]==NULL || str[i]==NULL || typ[i]==NULL || ids[i]==NULL) { fprintf(stderr,"[ERROR: failed to create index tables, exiting]\n"); exit(1); } record_idx[i]=0; } } if(verbose) fprintf(stderr,"[<%s, pass 2",input_file_name); fseek(input_file, 0, SEEK_SET); while(fgets(buff,MAXBUFFLENGTH,input_file)) { if(strlen(buff)<2) break; sscanf(buff, format[cis], &chr_name[0], &position, &strand, &name[0], &c); i = get_chr_code(chr_name); j = record_idx[i]; pos[i][j] = position + (c=='D' ? coord*strand : 0); str[i][j] = strand; typ[i][j] = c; ids[i][j] = (char*) malloc(sizeof(char)*(strlen(name)+1)); strcpy(ids[i][j],name); record_idx[i]++; } if(verbose) fprintf(stderr,"]\n"); if(verbose) fprintf(stderr,"[<%s,%s",idx_file_name,dbx_file_name); idx_file = fopen(idx_file_name,"r"); dbx_file = fopen(dbx_file_name,"r"); if(idx_file == NULL || dbx_file == NULL) { fprintf(stderr,"[ERROR: cannot access %s or %s, exiting]\n", idx_file_name, dbx_file_name); exit(1); } offset = 0; while(fgets(buff,MAXBUFFLENGTH,idx_file)) { if(strlen(buff)<2) break; sscanf(buff,"%s" , &name[0]); while(fgets(buff,MAXBUFFLENGTH,idx_file)) { if(strlen(buff)<2) break; sscanf(buff,"%s %li" , &chr_name[0], &seqlen); i = get_chr_code(chr_name); for(k=0;k<record_count[i];k++) { if(pos[i][k]>seqlen) { warnings++; continue; } l = exonic_window + intronic_window; if(typ[i][k]=='D' || typ[i][k]=='A' || all) { if(str[i][k]>0) { p = pos[i][k] - 1 - (typ[i][k]=='D' ? exonic_window : intronic_window); } else { p = pos[i][k] - (typ[i][k]=='D' ? intronic_window : exonic_window); } fget_segment(longbuff, dbx_file, offset, p, l); if(str[i][k]<0) { rev1(longbuff); } if(is_all_n(longbuff)) continue; fprintf(outfile,"%s\t%s\t%li\t%li\t%i\t%li\t%s\t%c\n",ids[i][k], chr_name, (str[i][k]>0 ? p + 1 : seqlen - (p + l)), l, str[i][k], seqlen, longbuff,typ[i][k]); } } offset+= (seqlen % 8 == 0) ? seqlen/8 : (seqlen/8 + 1); } } fclose(outfile); fclose(idx_file); fclose(dbx_file); if(verbose) fprintf(stderr,"]\n"); if(verbose && warnings>0) fprintf(stderr,"[WARNING: %i windows were out of range, they were ignored]\n", warnings); timestamp_report(); exit(0); }
int main(int argc, char* argv[]) { char cps_file_name[MAXBUFFLENGTH]; char chain_file_name[MAXBUFFLENGTH]; char out_file_name[MAXBUFFLENGTH]=""; int marginlength = 0; char buff[MAXBUFFLENGTH+1]; char aux[MAXBUFFLENGTH+1]; long score; int start1,end1,len1,start2,end2,len2; char strand1, strand2, chr1[MAXBUFFLENGTH], chr2[MAXBUFFLENGTH]; int *size, *dq, *dt; int a,b,k,i,j,s,m,x; int *position; int *strand; int *idg; int *ids; char *type; char c; int chridx[MAXCHR+1]; int chroff[MAXCHR+1]; char resstr; int rescrd; long chain_id; if(argc==1) { fprintf(stderr,"This utility does liftOver of coordinates (cps) by using chain alignment\n"); fprintf(stderr,"Gene information is included in the output\n"); fprintf(stderr,"Last update by Dmitri Pervouchine ([email protected]) on Mar 22, 2013\n"); fprintf(stderr,"Usage: %s -in <cps_file> -chain <chain_alignment_file> [-margin <length>] [-quiet]\n", argv[0]); fprintf(stderr," -in cps6, i.e. chr1/position1/strand1/gene/site/type tab-delimited file, strand is +/-\n"); fprintf(stderr," -chain UCSC chain alignment file, species1=>2\n"); fprintf(stderr," -out <output_file> [default=stdout]\n"); fprintf(stderr," -margin margin length [default=0]\n -quiet suppress verbose output [default=NO]\n"); fprintf(stderr,"NOTE: Input has to be sorted by position!\n"); fprintf(stderr,"Output format cps3+cps6: chr1/position1/strand1/chr2/position2/strand2/gene/site/type/score\n"); exit(1); } timestamp_set(); for(i=1;i<argc;i++) { if(strcmp(argv[i],"-in")==0) { sscanf(argv[++i], "%s", &cps_file_name[0]); } if(strcmp(argv[i],"-chain")==0) { sscanf(argv[++i], "%s", &chain_file_name[0]); } if(strcmp(argv[i],"-out")==0) { sscanf(argv[++i], "%s", &out_file_name[0]); } if(strcmp(argv[i],"-margin")==0) { sscanf(argv[++i], "%i", &marginlength); } if(strcmp(argv[i],"-quiet")==0) { verbose=0; } } if(out_file_name[0]==0) { fprintf(stderr,"[WARNING: output file not specified, redirect to stdout]\n"); out_file = stdout; } else { out_file = fopen(out_file_name,"w"); if(out_file == NULL) { fprintf(stderr,"[ERROR: output file %s cannot be opened for writing, exiting]\n", out_file_name); exit(1); } if(verbose) fprintf(stderr,"[>%s]\n",out_file_name); } cps_file= fopen(cps_file_name,"r"); if(cps_file==NULL) { fprintf(stderr,"[ERROR: cannot access %s, exiting]\n", cps_file_name); exit(1); } for(i=0;i<MAXCHR;i++) chridx[i] = chroff[i] = 0; if(verbose) fprintf(stderr,"[<%s, pass 1",cps_file_name); while(fgets(buff,MAXBUFFLENGTH,cps_file)) { if(strlen(buff)<2) break; sscanf(buff,"%s" , aux); chridx[assign_code(aux)]++; } if(verbose) fprintf(stderr,"]\n"); for(s=i=0;i<MAXCHR;i++) { x = chridx[i]; chridx[i] =s; s+=x; } chridx[i] = s; position = (int*) malloc(sizeof(int)*(s + ARRAY_MARGIN)); strand = (int*) malloc(sizeof(int)*(s + ARRAY_MARGIN)); ids = (int*) malloc(sizeof(int)*(s + ARRAY_MARGIN)); idg = (int*) malloc(sizeof(int)*(s + ARRAY_MARGIN)); type = (char*) malloc(sizeof(char)*(s + ARRAY_MARGIN)); if(position==NULL || strand==NULL || type==NULL || ids==NULL || idg==NULL) { fprintf(stderr,"[ERROR: failed to create index tables, exiting]\n"); exit(1); } fseek (cps_file, 0, SEEK_SET); if(verbose) fprintf(stderr,"[<%s, pass 2", cps_file_name); while(fgets(buff,MAXBUFFLENGTH,cps_file)) { if(strlen(buff)<2) break; sscanf(buff,"%s" , aux); i = assign_code(aux); m = chridx[i]+chroff[i]; sscanf(buff,"%*s %i %c %i %i %c" , &position[m], &c,&idg[m],&ids[m],&type[m]); strand[m] = strand_c2i(c); chroff[i]++; } fclose(cps_file); if(verbose) fprintf(stderr,"]\n"); if(verbose) fprintf(stderr,"[Sort by position (if not done before)"); for(i=0;i<MAXCHR;i++) { k=1; while(k) { k=0; for(j=chridx[i];j<chridx[i+1]-1;j++) { if(position[j]>position[j+1]) { k=1; swapi(position+j,position+j+1); swapi(strand+j,strand+j+1); swapi(idg+j,idg+j+1); swapi(ids+j,ids+j+1); swapc(type+j,type+j+1); } } } } if(verbose) fprintf(stderr,"]\n"); /**********************************************************************************************/ size = (int*) malloc(sizeof(int)*(MAXALN + ARRAY_MARGIN)); dq = (int*) malloc(sizeof(int)*(MAXALN + ARRAY_MARGIN)); dt = (int*) malloc(sizeof(int)*(MAXALN + ARRAY_MARGIN)); if(size ==0 || dq ==0 || dt==0) { fprintf(stderr,"[ERROR: not enough memory for chains, exiting]\n"); exit(1); } /**********************************************************************************************/ chain_file = fopen(chain_file_name,"r"); if(chain_file==NULL) { fprintf(stderr,"[ERROR: cannot access %s, exiting]\n", chain_file_name); exit(1); } fseek(chain_file, 0, SEEK_END); unsigned int last_pos = ftell(chain_file); fseek(chain_file, 0, SEEK_SET); while(fgets(buff,MAXBUFFLENGTH,chain_file)) { if(strlen(buff)<2) break; buff[5]=0; if(strcmp(buff,"chain")==0) { sscanf(buff+6,"%li %s %i %c %i %i %s %i %c %i %i %li",&score, &chr1[0], &len1, &strand1, &start1, &end1, &chr2[0], &len2, &strand2, &start2, &end2, &chain_id); k=0; while(fgets(buff,MAXBUFFLENGTH,chain_file)) { if(strlen(buff)<2) break; progressbar(ftell(chain_file), last_pos-1, (char*)"Processing ", verbose); sscanf(buff,"%i %i %i",size + k, dt + k, dq + k); k++; if(k>=MAXALN) { fprintf(stderr,"[ERROR: chain too long, exiting]\n"); exit(1); } } x = get_chr_code(chr1); if(x<0) continue; a=start1;b=start2; j=0; for(i=chridx[x];i<chridx[x+1] && position[i]<start1;i++); for(;i<chridx[x+1]&& position[i]<end1;i++) { while(position[i]>a+size[j]+dt[j] && j<k){ a+=size[j]+dt[j]; b+=size[j]+dq[j]; j++; } if(j>=k) break; if(position[i]-a > marginlength && a+size[j]-position[i] >= marginlength) { if(strand1==strand2) { resstr = strand[i]; rescrd = position[i] - a + b; } else { resstr = -strand[i]; rescrd = len2 - (position[i] - a + b - 1) ; } fprintf(out_file,"%s\t%i\t%c\t%s\t%i\t%c\t%i\t%i\t%c\t%li\n",chr1, position[i], strand_i2c(strand[i]), chr2, rescrd, strand_i2c(resstr), idg[i], ids[i],type[i],score); } } } } fclose(chain_file); fclose(out_file); timestamp_report(); free(size); free(dq); free(dt); free(position); free(strand); exit(0); }
int main(int argc, char* argv[]) { char alnfilename[MAXBUFFLENGTH]; char cpsfilename[MAXBUFFLENGTH]; char outfilename[MAXBUFFLENGTH]=""; char c; char buff[MAXBUFFLENGTH]; char aux[MAXBUFFLENGTH]; int start1,end1,len1,start2,end2,len2; char strand1, strand2; int a,b,k,l,i,j,s,m,q; int d,dmin,lmin,score_max; int x,y; int qbest, kbest, jbest; int kprev; int *gene_idx; //index int *gene_off; //offset int *gene_site; //site number char *gene_styp; //site type int *gene_pos; //site position int *gene_chr; //chromosome char *gene_str; //strand int max_genes; int max_sites; int *site_idx; //index int *site_off; //offset int *site_chr; //matching chromosome int *site_pos; //matching pos int *site_str; //matching strand int *site_score; //--- optimal score int *site_lbest; //--- where it came from int *site_qbest; //--- where it came from int specific_site=0; double dthreshold = 0.50; int dlimit = 5000; int max_depth = 4; int pos, strand; if(argc==1) { fprintf(stderr,"Select best unique maping from the ALN file created by map_single\n"); fprintf(stderr,"Last updated by Dmitri Pervouchine ([email protected]) on Jan 28, 2013\n"); fprintf(stderr,"Keys:\n -in <cps file>\n -aln <aln file>\n -out <output file>\n"); //fprintf(stderr," -l length difference limit [%i]\n -t percentage difference threshold [%1.2lf] (ONE OR THE OTHER THRESHOLD IS USED)\n -h max_depth [%i]\n",dlimit, dthreshold,max_depth); //fprintf(stderr," -v suppress verbose output [NO]\n"); exit(1); } timestamp_set(); for(i=1;i<argc;i++) { if(strcmp(argv[i],"-in")==0) { sscanf(argv[++i], "%s", &cpsfilename[0]); } if(strcmp(argv[i],"-aln")==0) { sscanf(argv[++i], "%s", &alnfilename[0]); } if(strcmp(argv[i],"-out")==0) { sscanf(argv[++i], "%s", &outfilename[0]); } if(strcmp(argv[i],"-lendiff")==0) { sscanf(argv[++i], "%i", &dlimit); } if(strcmp(argv[i],"-threshold")==0) { sscanf(argv[++i], "%lf", &dthreshold); } if(strcmp(argv[i],"-maxdepth")==0) { sscanf(argv[++i], "%i", &max_depth); } if(strcmp(argv[i],"-quiet")==0) { verbose=0; } if(strcmp(argv[i],"-s")==0) { sscanf(argv[++i], "%i", &specific_site); } } if(outfilename[0]==0) { fprintf(stderr,"[WARNING: output file not specified, redirect to stdout]\n"); outfile = stdout; } else { outfile = fopen(outfilename,"w"); if(outfile == NULL) { fprintf(stderr,"[ERROR: output file (%s) cannot be opened, exiting]\n", outfilename); exit(1); } } /*******************************************************************************************************/ cpsfile= fopen(cpsfilename,"r"); if(cpsfile==NULL) { fprintf(stderr,"Can't access CPS file. Exiting\n"); exit(1); } if(verbose) fprintf(stderr,"[Reading CPS, pass 0"); max_sites = max_genes = 0; while(fgets(buff,MAXBUFFLENGTH,cpsfile)) { if(strlen(buff)<2) break; sscanf(buff,"%*s %*i %*i %i %i", &i, &j); if(i>max_genes) max_genes = i; if(j>max_sites) max_sites = j; } max_genes++; max_sites++; gene_idx = (int*) malloc(sizeof(int)*(max_genes+1)); gene_off = (int*) malloc(sizeof(int)*(max_genes+1)); gene_chr = (int*) malloc(sizeof(int)*(max_genes+1)); gene_str = (char*) malloc(sizeof(char)*(max_genes+1)); for(i=0;i<max_genes;i++) gene_idx[i]=gene_off[i]=0; if(verbose) fprintf(stderr,", max_genes = %i, max_sites = %i]\n", max_genes, max_sites); if(verbose) fprintf(stderr,"[Reading CPS, pass 1"); fseek (cpsfile, 0, SEEK_SET); while(fgets(buff,MAXBUFFLENGTH,cpsfile)) { if(strlen(buff)<2) break; sscanf(buff,"%s %*i %i %i %*i" , aux, &strand, &i); gene_idx[i]++; gene_chr[i] = assign_code(aux); gene_str[i] = strand; } for(s=i=0;i<max_genes;i++) { x = gene_idx[i]; gene_idx[i] =s; s+=x; } gene_idx[i] = s; gene_site = (int*) malloc(sizeof(int)*(s+1)); gene_styp = (char*) malloc(sizeof(char)*(s+1)); gene_pos = (int*) malloc(sizeof(int)*(s+1)); if(verbose) fprintf(stderr,", records = %i]\n", s); if(verbose) fprintf(stderr,"[Reading CPS, pass 2"); fseek (cpsfile, 0, SEEK_SET); while(fgets(buff,MAXBUFFLENGTH,cpsfile)) { if(strlen(buff)<2) break; sscanf(buff,"%*s %i %*i %i %i %c" , &x, &i, &j, &c); gene_site[gene_idx[i]+gene_off[i]]=j; gene_styp[gene_idx[i]+gene_off[i]]=c; gene_pos[gene_idx[i]+gene_off[i]]=x; gene_off[i]++; } fclose(cpsfile); if(verbose) fprintf(stderr,"]\n"); /**********************************************************************************************/ alnfile = fopen(alnfilename,"r"); if(alnfile == NULL) { fprintf(stderr, "Cant open alignment file, exiting\n"); exit(1); } site_idx = (int*) malloc(sizeof(int)*(max_sites+1)); site_off = (int*) malloc(sizeof(int)*(max_sites+1)); for(i=0;i<max_sites;i++) { site_idx[i]=site_off[i]=0; } if(verbose) fprintf(stderr,"[Reading alignment file, pass 1"); while(fgets(buff,MAXBUFFLENGTH,alnfile)) { if(strlen(buff)<2) break; sscanf(buff,"%*s %*i %*i %*s %*i %*i %*i %i %*c" , &i); site_idx[i]++; } for(s=i=0;i<max_sites;i++) { x = site_idx[i]; site_idx[i] =s; s+=x; } site_idx[i] = s; site_chr = (int*) malloc(sizeof(int)*(s+1)); site_pos = (int*) malloc(sizeof(int)*(s+1)); site_str = (int*) malloc(sizeof(int)*(s+1)); site_score = (int*) malloc(sizeof(int)*(s+1)); site_lbest = (int*) malloc(sizeof(int)*(s+1)); site_qbest = (int*) malloc(sizeof(int)*(s+1)); if(verbose) fprintf(stderr,", records = %i]\n",s); if(verbose) fprintf(stderr,"[Reading alignment file, pass 2"); fseek (alnfile, 0, SEEK_SET); while(fgets(buff,MAXBUFFLENGTH,alnfile)) { if(strlen(buff)<2) break; sscanf(buff,"%*s %*i %*i %s %i %i %*i %i %*c" , &aux, &pos, &strand, &i); site_chr[site_idx[i]+site_off[i]] = assign_code(aux); site_pos[site_idx[i]+site_off[i]] = pos; site_str[site_idx[i]+site_off[i]] = strand; site_score[site_idx[i]+site_off[i]] = 0; site_lbest[site_idx[i]+site_off[i]] = -1; site_qbest[site_idx[i]+site_off[i]] = -1; site_off[i]++; } fclose(alnfile); if(verbose) fprintf(stderr,"]\n"); for(i=0;i<max_genes;i++) { progressbar(i,max_genes-1, (char*)"Processing"); score_max=0; kbest = -1; for(j=gene_idx[i];j<gene_idx[i+1];j++) { x = gene_site[j]; for(k=site_idx[x];k<site_idx[x+1];k++) { site_score[k] = 0; site_lbest[k] = site_qbest[k] = -1; } for(q=1;q<=max_depth;q++) { if(j-q>=gene_idx[i]) { y = gene_site[j-q]; a = abs(gene_pos[j]-gene_pos[j-q]); for(k=site_idx[x];k<site_idx[x+1];k++) { dmin = INFTY; lmin = -1; for(l=site_idx[y];l<site_idx[y+1];l++) { if(site_chr[k] == site_chr[l] && site_str[k] == site_str[l]) { b = abs(site_pos[k]-site_pos[l]); d = abs(b-a); if(d<dmin) { dmin = d; lmin = l; } if(x==specific_site) fprintf(stderr,"[prev=%i curr=%i pos_p=%i pos_c=%i d=%i]\n",y,x,site_pos[l],site_pos[k],d); } } m = (lmin>=0 && (((double)dmin/a)<dthreshold || dmin<dlimit)) ? site_score[lmin] + a : 0; if(m>site_score[k]) { site_score[k] = m; site_lbest[k] = lmin; site_qbest[k] = q; } if(site_score[k]>score_max) { score_max = site_score[k]; kbest = k; jbest = j; } if(x==specific_site) fprintf(stderr,"[curr=%i score=%i]\n",x,site_score[k]); } } } } j = jbest; k = kbest; if(k>=0 && site_score[k]>0) { fprintf(outfile,"%s\t%i\t%i\t%s\t%i\t%i\t",get_chr_name(gene_chr[i]),gene_pos[j],gene_str[i],get_chr_name(site_chr[k]),site_pos[k],site_str[k]); fprintf(outfile,"%i\t%i\t%c\t%i\t%i\n",i,gene_site[j],gene_styp[j],site_pos[site_lbest[k]],0); while(site_score[k]>0 && site_lbest[k]>=0 && site_qbest[k]>=0) { kprev = k; j = j - site_qbest[k]; k = site_lbest[k]; fprintf(outfile,"%s\t%i\t%i\t%s\t%i\t%i\t",get_chr_name(gene_chr[i]),gene_pos[j],gene_str[i],get_chr_name(site_chr[k]),site_pos[k],site_str[k]); fprintf(outfile,"%i\t%i\t%c\t%i\t%i\n",i,gene_site[j],gene_styp[j],(site_lbest[k]>=0?site_pos[site_lbest[k]]:0),(kprev>=0?site_pos[kprev]:0)); } } } timestamp_report(); exit(0); }
int main(int argc, char* argv[]) { char aln_file_name[MAXBUFFLENGTH]; char out_file_name[MAXBUFFLENGTH]=""; char buff[MAXBUFFLENGTH]; char chr1[MAXBUFFLENGTH]; char chr2[MAXBUFFLENGTH]; double dthreshold = 1.50; int dlimit = 100000; int max_depth = 4; int** chr_t; int** str_t; int** pos_t; int* chr_q; int* str_q; int* pos_q; int **score; int **jbest; int **lbest; int a,b,d,dmin,lmin,s; char c,c1,c2; int *count; int *ptr; int max_rec=0; int pos1, pos2, str1, str2; int i, j, k, l, n; int s_max, k_max; if(argc==1) { fprintf(stderr,"This utility takes a non-unique mapping in cps3+cps3 format and does ad-hoc filtering of the projected coordinates by maximum synteny\n"); fprintf(stderr,"Last update by Dmitri Pervouchine ([email protected]) on Mar 26, 2013\n"); fprintf(stderr,"Usage: %s -in <aln_file> -out <output_file> [-maxdepth <int>] [-threshold <double>] [-lendiff <diff>] [-quiet]\n",argv[0]); fprintf(stderr," -in cps3+cps3 file, remember to sort by position in ascending order\n"); fprintf(stderr," -out <output_file> [default=stdout]\n"); fprintf(stderr," -maxdepth <integer> how many preceding positions can be skipped [default=%i]\n", max_depth); fprintf(stderr," -threshold <double> max change of segment length, in percent [default=%2.2lf]\n", dthreshold); fprintf(stderr," -lendiff <integer>, [default=%i]\n",dlimit); fprintf(stderr," -quiet suppress verbose output [default=NO]\n"); fprintf(stderr,"Note: the mapping [x,x+dx] -> [y,y+dy] is OK if |dy-dx|/dx<threshold OR |dy-dx|<dlimit\n"); exit(1); } timestamp_set(); for(i=1;i<argc;i++) { if(strcmp(argv[i],"-in")==0) { sscanf(argv[++i], "%s", &aln_file_name[0]); } if(strcmp(argv[i],"-out")==0) { sscanf(argv[++i], "%s", &out_file_name[0]); } if(strcmp(argv[i],"-lendiff")==0) { sscanf(argv[++i], "%i", &dlimit); } if(strcmp(argv[i],"-threshold")==0) { sscanf(argv[++i], "%lf", &dthreshold); } if(strcmp(argv[i],"-maxdepth")==0) { sscanf(argv[++i], "%i", &max_depth); } if(strcmp(argv[i],"-quiet")==0) { verbose=0; } } if(out_file_name[0]==0) { fprintf(stderr,"[WARNING: output file not specified, redirect to stdout]\n"); out_file = stdout; } else { out_file = fopen(out_file_name,"w"); if(out_file == NULL) { fprintf(stderr,"[ERROR: output file (%s) cannot be opened, exiting]\n", out_file_name); exit(1); } if(verbose) fprintf(stderr,"[>%s]\n",out_file_name); } /*******************************************************************************************************/ aln_file= fopen(aln_file_name,"r"); if(aln_file==NULL) { fprintf(stderr,"[ERROR: cannot access %s, exiting]\n", aln_file_name); exit(1); } if(verbose) fprintf(stderr,"[<%s, pass 1", aln_file_name); while(fgets(buff, MAXBUFFLENGTH, aln_file)) { if(strlen(buff)<2) break; max_rec++; } if(verbose) fprintf(stderr,"]\n"); chr_q = (int*)malloc(sizeof(int)*(max_rec + ARRAY_MARGIN)); str_q = (int*)malloc(sizeof(int)*(max_rec + ARRAY_MARGIN)); pos_q = (int*)malloc(sizeof(int)*(max_rec + ARRAY_MARGIN)); chr_t = (int**)malloc(sizeof(int*)*(max_rec + ARRAY_MARGIN)); str_t = (int**)malloc(sizeof(int*)*(max_rec + ARRAY_MARGIN)); pos_t = (int**)malloc(sizeof(int*)*(max_rec + ARRAY_MARGIN)); score = (int**)malloc(sizeof(int*)*(max_rec + ARRAY_MARGIN)); lbest = (int**)malloc(sizeof(int*)*(max_rec + ARRAY_MARGIN)); jbest = (int**)malloc(sizeof(int*)*(max_rec + ARRAY_MARGIN)); count = (int*)malloc(sizeof(int)*(max_rec + ARRAY_MARGIN)); ptr = (int*)malloc(sizeof(int)*(max_rec + ARRAY_MARGIN)); if(chr_q == NULL || str_q == NULL || pos_q == NULL || count == NULL || ptr == NULL || chr_t == NULL || str_t == NULL || pos_t == NULL) { fprintf(stderr,"[ERROR: not enough memory, exiting]\n"); exit(1); } for(i=0;i<max_rec;i++) count[i] = ptr[i] = 0; if(verbose) fprintf(stderr,"[<%s, pass 2", aln_file_name); fseek (aln_file, 0, SEEK_SET); n=0; while(fgets(buff, MAXBUFFLENGTH, aln_file)) { if(strlen(buff)<2) break; sscanf(buff,"%s %i %c" , &chr1[0], &pos1, &c); str1 = strand_c2i(c); if(assign_code(chr1) != chr_q[n] || str1 != str_q[n] || pos1 != pos_q[n]) { n++; chr_q[n] = assign_code(chr1); str_q[n] = str1; pos_q[n] = pos1; } count[n]++; } if(verbose) fprintf(stderr,"]\n"); for(i=1; i<=n; i++) { chr_t[i] = (int*)malloc(sizeof(int)*(count[i] + ARRAY_MARGIN)); pos_t[i] = (int*)malloc(sizeof(int)*(count[i] + ARRAY_MARGIN)); str_t[i] = (int*)malloc(sizeof(int)*(count[i] + ARRAY_MARGIN)); score[i] = (int*)malloc(sizeof(int)*(count[i] + ARRAY_MARGIN)); lbest[i] = (int*)malloc(sizeof(int)*(count[i] + ARRAY_MARGIN)); jbest[i] = (int*)malloc(sizeof(int)*(count[i] + ARRAY_MARGIN)); if(chr_t[i] == NULL || str_t[i] == NULL || pos_t[i] == NULL) { fprintf(stderr,"[ERROR: not enough memory, exiting]\n"); exit(1); } } if(verbose) fprintf(stderr,"[<%s, pass 3", aln_file_name); fseek (aln_file, 0, SEEK_SET); n=0; while(fgets(buff, MAXBUFFLENGTH, aln_file)) { if(strlen(buff)<2) break; sscanf(buff,"%s %i %c %s %i %c" , &chr1[0], &pos1, &c1, &chr2[0], &pos2, &c2); str1 = strand_c2i(c1); str2 = strand_c2i(c2); if(assign_code(chr1) != chr_q[n] || str1 != str_q[n] || pos1 != pos_q[n]) n++; chr_t[n][ptr[n]] = assign_code(chr2); pos_t[n][ptr[n]] = pos2; str_t[n][ptr[n]] = str2; ptr[n]++; } if(verbose) fprintf(stderr,"]\n"); for(i=1; i<=n; i++) { progressbar(i, n, (char*)"Filtering ", verbose); for(k=0; k<count[i]; k++) { score[i][k] = 0; lbest[i][k] = -1; jbest[i][k] = -1; for(j=i-1; j>0 && i-j<=max_depth; j--) { a = abs(pos_q[i] - pos_q[j]); dmin = INFTY; lmin = -1; for(l=0; l<count[j]; l++) { if(chr_t[i][k] == chr_t[j][l] && str_t[i][k]*str_t[j][l] == str_q[i]*str_q[j]) { b = abs(pos_t[i][k] - pos_t[j][l]); d = abs(b-a); if(d<dmin) { dmin = d; lmin = l; } } } s = (lmin>=0 && (((double)dmin/a) < dthreshold || dmin < dlimit)) ? score[j][lmin] + a : 0; if(s > score[i][k]) { score[i][k] = s; jbest[i][k] = j; lbest[i][k] = lmin; } } } } for(i=n;i>0;i--) { s_max = 0; k_max = -1; for(k=0; k<count[i]; k++) { if(score[i][k]>s_max) { s_max = score[i][k]; k_max = k; } } if(k_max>=0) { k = k_max; while(jbest[i][k]>=0 && lbest[i][k]>=0) { fprintf(out_file,"%s\t%i\t%c\t", get_chr_name(chr_q[i]), pos_q[i], strand_i2c(str_q[i])); fprintf(out_file,"%s\t%i\t%c\n", get_chr_name(chr_t[i][k]), pos_t[i][k], strand_i2c(str_t[i][k])); j = jbest[i][k]; l = lbest[i][k]; i = j; k = l; } } } timestamp_report(); exit(0); }