int main(int argc, char* argv[]) { char out_file_name[MAXBUFFLENGTH]=""; char input_file_name[MAXBUFFLENGTH]=""; char idx_file_name[MAXBUFFLENGTH]; char dbx_file_name[MAXBUFFLENGTH]; long offset; long seqlen; long intronic_window = 150; long exonic_window = 0; char filename[MAXBUFFLENGTH]; char name[MAXBUFFLENGTH]; char chr_name[MAXBUFFLENGTH]; char buff[MAXBUFFLENGTH]; char longbuff[MAXLONGBUFFLENGTH]; char longbuffm[MAXLONGBUFFLENGTH]; FILE *idx_file; FILE *dbx_file; FILE *input_file; FILE *outfile; int b,i,j,q,k,n,a,m,s; char c; long position,y; int strand; long** pos; int** str; char** typ; char*** ids; long p,l; int record_count[MAXCHR]; int record_idx[MAXCHR]; int cis = 0; int coord = 0; int all=0; int warnings = 0; char format[][64] = {"%*s %*i %*i %s %li %i %*s %s %c", "%s %li %i %*s %s %c"}; if(argc==1) { fprintf(stderr,"This routine get sequence segments from a custom compressed FASTA repository (see transf)\n"); fprintf(stderr,"Last update by Dmitri Pervouchine ([email protected]) on Mar 22, 2013\n"); fprintf(stderr," -in <aln_file>\n -dbx <database_file>\n -idx <index_file>\n -out <output_file>\n"); fprintf(stderr," -we <exonic_window> [default=%i]\n -wi <intronic_window> [default=%i]\n -cis [use colunms 1-3] [default=%i]\n", exonic_window, intronic_window, cis); fprintf(stderr," -quiet <suppress verbose output> [default=no]\n -all <include all sites>\n -coord <offset for 3'-sites> [default=%i]\n",coord); exit(1); } timestamp_set(); for(i=1;i<argc;i++) { if(strcmp(argv[i],"-in")==0) { sscanf(argv[++i], "%s", &input_file_name[0]); } if(strcmp(argv[i],"-dbx")==0) { sscanf(argv[++i], "%s", &dbx_file_name[0]); } if(strcmp(argv[i],"-idx")==0) { sscanf(argv[++i], "%s", &idx_file_name[0]); } if(strcmp(argv[i],"-out")==0) { sscanf(argv[++i], "%s", &out_file_name[0]); } if(strcmp(argv[i],"-we")==0) { sscanf(argv[++i], "%li", &exonic_window); } if(strcmp(argv[i],"-wi")==0) { sscanf(argv[++i], "%li", &intronic_window); } if(strcmp(argv[i],"-coord")==0) { sscanf(argv[++i], "%i", &coord); } if(strcmp(argv[i],"-quiet")==0) { verbose = 0; } if(strcmp(argv[i],"-cis")==0) { cis = 1; } if(strcmp(argv[i],"-all")==0) { all = 1; } } if(out_file_name[0]==0) { fprintf(stderr,"[WARNING: output file not specified, redirect to stdout]\n"); outfile = stdout; } else { outfile = fopen(out_file_name,"w"); if(outfile == NULL) { fprintf(stderr,"[ERROR: output file %s cannot be opened for writing, exiting]\n", out_file_name); exit(1); } if(verbose) fprintf(stderr,"[>%s]\n",out_file_name); } input_file = fopen(input_file_name,"r"); if(input_file == NULL) { fprintf(stderr,"[ERROR: cannot access %s, exiting]\n", input_file_name); exit(1); } if(verbose) fprintf(stderr,"[<%s, pass 1",input_file_name); while(fgets(buff,MAXBUFFLENGTH,input_file)) { if(strlen(buff)<2) break; sscanf(buff, format[cis], &chr_name[0], &position, &strand, &name[0], &c); n = assign_code(chr_name); record_count[n]++; } if(verbose) fprintf(stderr,"]\n"); pos = (long**) malloc(sizeof(long*)*(N_CHR_NAMES+1)); str = (int**) malloc(sizeof(int*)*(N_CHR_NAMES+1)); typ = (char**) malloc(sizeof(char*)*(N_CHR_NAMES+1)); ids = (char***)malloc(sizeof(char**)*(N_CHR_NAMES+1)); if(pos==NULL || str==NULL || typ==NULL || ids==NULL) { fprintf(stderr,"[ERROR: failed to create index tables, exiting]\n"); exit(1); } for(i=0;i<N_CHR_NAMES;i++) { if(record_count[i]>0) { pos[i] = (long*) malloc(sizeof(long)*(record_count[i]+1)); str[i] = (int*) malloc(sizeof(int)*(record_count[i]+1)); typ[i] = (char*) malloc(sizeof(char)*(record_count[i]+1)); ids[i] = (char**) malloc(sizeof(char*)*(record_count[i]+1)); if(pos[i]==NULL || str[i]==NULL || typ[i]==NULL || ids[i]==NULL) { fprintf(stderr,"[ERROR: failed to create index tables, exiting]\n"); exit(1); } record_idx[i]=0; } } if(verbose) fprintf(stderr,"[<%s, pass 2",input_file_name); fseek(input_file, 0, SEEK_SET); while(fgets(buff,MAXBUFFLENGTH,input_file)) { if(strlen(buff)<2) break; sscanf(buff, format[cis], &chr_name[0], &position, &strand, &name[0], &c); i = get_chr_code(chr_name); j = record_idx[i]; pos[i][j] = position + (c=='D' ? coord*strand : 0); str[i][j] = strand; typ[i][j] = c; ids[i][j] = (char*) malloc(sizeof(char)*(strlen(name)+1)); strcpy(ids[i][j],name); record_idx[i]++; } if(verbose) fprintf(stderr,"]\n"); if(verbose) fprintf(stderr,"[<%s,%s",idx_file_name,dbx_file_name); idx_file = fopen(idx_file_name,"r"); dbx_file = fopen(dbx_file_name,"r"); if(idx_file == NULL || dbx_file == NULL) { fprintf(stderr,"[ERROR: cannot access %s or %s, exiting]\n", idx_file_name, dbx_file_name); exit(1); } offset = 0; while(fgets(buff,MAXBUFFLENGTH,idx_file)) { if(strlen(buff)<2) break; sscanf(buff,"%s" , &name[0]); while(fgets(buff,MAXBUFFLENGTH,idx_file)) { if(strlen(buff)<2) break; sscanf(buff,"%s %li" , &chr_name[0], &seqlen); i = get_chr_code(chr_name); for(k=0;k<record_count[i];k++) { if(pos[i][k]>seqlen) { warnings++; continue; } l = exonic_window + intronic_window; if(typ[i][k]=='D' || typ[i][k]=='A' || all) { if(str[i][k]>0) { p = pos[i][k] - 1 - (typ[i][k]=='D' ? exonic_window : intronic_window); } else { p = pos[i][k] - (typ[i][k]=='D' ? intronic_window : exonic_window); } fget_segment(longbuff, dbx_file, offset, p, l); if(str[i][k]<0) { rev1(longbuff); } if(is_all_n(longbuff)) continue; fprintf(outfile,"%s\t%s\t%li\t%li\t%i\t%li\t%s\t%c\n",ids[i][k], chr_name, (str[i][k]>0 ? p + 1 : seqlen - (p + l)), l, str[i][k], seqlen, longbuff,typ[i][k]); } } offset+= (seqlen % 8 == 0) ? seqlen/8 : (seqlen/8 + 1); } } fclose(outfile); fclose(idx_file); fclose(dbx_file); if(verbose) fprintf(stderr,"]\n"); if(verbose && warnings>0) fprintf(stderr,"[WARNING: %i windows were out of range, they were ignored]\n", warnings); timestamp_report(); exit(0); }
int main(int argc, char* argv[]) { char cpsfilename[MAXBUFFLENGTH]; char chainfilename[MAXBUFFLENGTH]; char outfilename[MAXBUFFLENGTH]=""; int marginlength = 0; int MAXREC; char c; char *pc; int x; char buff[MAXBUFFLENGTH+1]; char aux[MAXBUFFLENGTH+1]; long score; int start1,end1,len1,start2,end2,len2; char strand1, strand2, chr1[MAXBUFFLENGTH], chr2[MAXBUFFLENGTH]; int *size, *dq, *dt; int a,b,k,i,j,s,m; int *position; char *strand; int *ids; int *idg; char *type; int chridx[MAXCHR+1]; int chroff[MAXCHR+1]; char resstr; int rescrd; if(argc==1) { fprintf(stderr,"Finds matches of the given set of sites (CPS file) in the BLASTZ chain alignment (CHAIN file)\n"); fprintf(stderr,"Last update by (dp) on Sep 21, 2011\n"); fprintf(stderr,"Keys:\n -i CPS file (remember to sort by position in ascending order)\n -d CHAIN alignment file\n -o output file\n"); fprintf(stderr," -m margin length [0]\n -v suppress verbose output [NO]\n"); exit(1); } timestamp_set(); for(i=1;i<argc;i++) { pc = argv[i]; if(*pc != '-') continue; if(*(pc+1) == 'i') { sscanf(argv[++i], "%s", &cpsfilename[0]); } if(*(pc+1) == 'd') { sscanf(argv[++i], "%s", &chainfilename[0]); } if(*(pc+1) == 'o') { sscanf(argv[++i], "%s", &outfilename[0]); } if(*(pc+1) == 'm') { sscanf(argv[++i], "%i", &marginlength); } if(*(pc+1) == 'v') { verbose=0; } } if(outfilename[0]==0) { fprintf(stderr,"No output file privided, exiting\n"); exit(1); } outfile = fopen(outfilename,"w"); if(outfile == NULL) { fprintf(stderr,"Can't open output file, exiting\n"); exit(1); } for(i=0;i<MAXCHR;i++) { chridx[i]=chroff[i]=0; } MAXREC = 0; cpsfile= fopen(cpsfilename,"r"); if(cpsfile==NULL) { fprintf(stderr,"Can't access CPS file. Exiting\n"); exit(1); } if(verbose) fprintf(stderr,"Reading CPS input pass 1"); while(!feof(cpsfile)) { buff[0]=0; fgets(buff,MAXBUFFLENGTH,cpsfile); if(strlen(buff)<2) break; sscanf(buff,"%s" , aux); chridx[assign_code(aux)]++; MAXREC++; } fclose(cpsfile); for(s=i=0;i<MAXCHR;i++) { x = chridx[i]; chridx[i] =s; s+=x; } chridx[i] = s; position = (int*) malloc(sizeof(int)*(s+4)); strand = (char*) malloc(sizeof(char)*(s+4)); type = (char*) malloc(sizeof(char)*(s+4)); ids = (int*) malloc(sizeof(int)*(s+4)); idg = (int*) malloc(sizeof(int)*(s+4)); if(position==NULL || strand==NULL || type==NULL || ids==NULL || idg==NULL) { fprintf(stderr,"Not enough memory. Terminated\n"); exit(1); } cpsfile= fopen(cpsfilename,"r"); if(verbose) fprintf(stderr,", records = %i\nReading CPS input pass 2",MAXREC); while(!feof(cpsfile)) { buff[0]=0; fgets(buff,MAXBUFFLENGTH,cpsfile); if(strlen(buff)<2) break; sscanf(buff,"%s" , aux); i = assign_code(aux); m = chridx[i]+chroff[i]; sscanf(buff,"%*s %i %c %i %i %c" , position+m, strand+m, idg+m, ids+m, type+m); chroff[i]++; } fclose(cpsfile); if(verbose) fprintf(stderr,"\nSorting segments"); /* for(i=0;i<MAXCHR;i++) { quickSort_ic(position,strand,chridx[i],chridx[i+1]-1); } */ for(i=0;i<MAXCHR;i++) { k=1; while(k) { k=0; for(j=chridx[i];j<chridx[i+1]-1;j++) { if(position[j]>position[j+1]) { k=1; swapi(position+j,position+j+1); swapc(strand+j,strand+j+1); swapc(type+j,type+j+1); swapi(ids+j,ids+j+1); swapi(idg+j,idg+j+1); } } } } if(verbose) fprintf(stderr," done\nProcessing chains"); /**********************************************************************************************/ size = (int*) malloc(sizeof(int)*MAXALN); dq = (int*) malloc(sizeof(int)*MAXALN); dt = (int*) malloc(sizeof(int)*MAXALN); if(size ==0 || dq ==0 || dt==0) { fprintf(stderr,"Not enough memory for such long chains. Terminated\n"); exit(1); } /**********************************************************************************************/ chainfile = fopen(chainfilename,"r"); while(!feof(chainfile)) { buff[0]=0; fgets(buff,MAXBUFFLENGTH,chainfile); if(strlen(buff)<2) break; buff[5]=0; if(strcmp(buff,"chain")==0) { sscanf(buff+6,"%li %s %i %c %i %i %s %i %c %i %i",&score, &chr1[0], &len1, &strand1, &start1, &end1, &chr2[0], &len2, &strand2, &start2, &end2); k=0; while(!feof(chainfile)) { buff[0]=0; fgets(buff,MAXBUFFLENGTH,chainfile); if(strlen(buff)<2) break; sscanf(buff,"%i %i %i",&size[k],&dt[k],&dq[k]); k++; if(k>MAXALN) { fprintf(stderr,"Chain length exceeded. Terminating"); exit(1); } } x = get_chr_code(chr1); if(x<0) continue; a=start1;b=start2; j=0; for(i=chridx[x];i<chridx[x+1] && position[i]<start1;i++); for(;i<chridx[x+1]&& position[i]<end1;i++) { while(position[i]>a+size[j]+dt[j] && j<k){ a+=size[j]+dt[j]; b+=size[j]+dq[j]; j++; } if(j>=k) break; if(position[i]-a > marginlength && a+size[j]-position[i] >= marginlength) { if(strand1==strand2) { resstr = strand[i]; rescrd = position[i] - a + b; } else { resstr = (strand[i]=='+') ? '-' : '+'; rescrd = len2 - (position[i] - a + b - 1) ; } fprintf(outfile,"%s\t%i\t%c\t%s\t%i\t%c\t%i\t%i\t%c\t%li\n",chr1, position[i], strand[i],chr2,rescrd, resstr, idg[i], ids[i],type[i],score); } } } } if(verbose) fprintf(stderr," done\n"); fclose(chainfile); fclose(outfile); timestamp_report(); exit(0); }
int main(int argc, char* argv[]) { char cps_file_name[MAXBUFFLENGTH]; char chain_file_name[MAXBUFFLENGTH]; char out_file_name[MAXBUFFLENGTH]=""; int marginlength = 0; char buff[MAXBUFFLENGTH+1]; char aux[MAXBUFFLENGTH+1]; long score; int start1,end1,len1,start2,end2,len2; char strand1, strand2, chr1[MAXBUFFLENGTH], chr2[MAXBUFFLENGTH]; int *size, *dq, *dt; int a,b,k,i,j,s,m,x; int *position; int *strand; int *idg; int *ids; char *type; char c; int chridx[MAXCHR+1]; int chroff[MAXCHR+1]; char resstr; int rescrd; long chain_id; if(argc==1) { fprintf(stderr,"This utility does liftOver of coordinates (cps) by using chain alignment\n"); fprintf(stderr,"Gene information is included in the output\n"); fprintf(stderr,"Last update by Dmitri Pervouchine ([email protected]) on Mar 22, 2013\n"); fprintf(stderr,"Usage: %s -in <cps_file> -chain <chain_alignment_file> [-margin <length>] [-quiet]\n", argv[0]); fprintf(stderr," -in cps6, i.e. chr1/position1/strand1/gene/site/type tab-delimited file, strand is +/-\n"); fprintf(stderr," -chain UCSC chain alignment file, species1=>2\n"); fprintf(stderr," -out <output_file> [default=stdout]\n"); fprintf(stderr," -margin margin length [default=0]\n -quiet suppress verbose output [default=NO]\n"); fprintf(stderr,"NOTE: Input has to be sorted by position!\n"); fprintf(stderr,"Output format cps3+cps6: chr1/position1/strand1/chr2/position2/strand2/gene/site/type/score\n"); exit(1); } timestamp_set(); for(i=1;i<argc;i++) { if(strcmp(argv[i],"-in")==0) { sscanf(argv[++i], "%s", &cps_file_name[0]); } if(strcmp(argv[i],"-chain")==0) { sscanf(argv[++i], "%s", &chain_file_name[0]); } if(strcmp(argv[i],"-out")==0) { sscanf(argv[++i], "%s", &out_file_name[0]); } if(strcmp(argv[i],"-margin")==0) { sscanf(argv[++i], "%i", &marginlength); } if(strcmp(argv[i],"-quiet")==0) { verbose=0; } } if(out_file_name[0]==0) { fprintf(stderr,"[WARNING: output file not specified, redirect to stdout]\n"); out_file = stdout; } else { out_file = fopen(out_file_name,"w"); if(out_file == NULL) { fprintf(stderr,"[ERROR: output file %s cannot be opened for writing, exiting]\n", out_file_name); exit(1); } if(verbose) fprintf(stderr,"[>%s]\n",out_file_name); } cps_file= fopen(cps_file_name,"r"); if(cps_file==NULL) { fprintf(stderr,"[ERROR: cannot access %s, exiting]\n", cps_file_name); exit(1); } for(i=0;i<MAXCHR;i++) chridx[i] = chroff[i] = 0; if(verbose) fprintf(stderr,"[<%s, pass 1",cps_file_name); while(fgets(buff,MAXBUFFLENGTH,cps_file)) { if(strlen(buff)<2) break; sscanf(buff,"%s" , aux); chridx[assign_code(aux)]++; } if(verbose) fprintf(stderr,"]\n"); for(s=i=0;i<MAXCHR;i++) { x = chridx[i]; chridx[i] =s; s+=x; } chridx[i] = s; position = (int*) malloc(sizeof(int)*(s + ARRAY_MARGIN)); strand = (int*) malloc(sizeof(int)*(s + ARRAY_MARGIN)); ids = (int*) malloc(sizeof(int)*(s + ARRAY_MARGIN)); idg = (int*) malloc(sizeof(int)*(s + ARRAY_MARGIN)); type = (char*) malloc(sizeof(char)*(s + ARRAY_MARGIN)); if(position==NULL || strand==NULL || type==NULL || ids==NULL || idg==NULL) { fprintf(stderr,"[ERROR: failed to create index tables, exiting]\n"); exit(1); } fseek (cps_file, 0, SEEK_SET); if(verbose) fprintf(stderr,"[<%s, pass 2", cps_file_name); while(fgets(buff,MAXBUFFLENGTH,cps_file)) { if(strlen(buff)<2) break; sscanf(buff,"%s" , aux); i = assign_code(aux); m = chridx[i]+chroff[i]; sscanf(buff,"%*s %i %c %i %i %c" , &position[m], &c,&idg[m],&ids[m],&type[m]); strand[m] = strand_c2i(c); chroff[i]++; } fclose(cps_file); if(verbose) fprintf(stderr,"]\n"); if(verbose) fprintf(stderr,"[Sort by position (if not done before)"); for(i=0;i<MAXCHR;i++) { k=1; while(k) { k=0; for(j=chridx[i];j<chridx[i+1]-1;j++) { if(position[j]>position[j+1]) { k=1; swapi(position+j,position+j+1); swapi(strand+j,strand+j+1); swapi(idg+j,idg+j+1); swapi(ids+j,ids+j+1); swapc(type+j,type+j+1); } } } } if(verbose) fprintf(stderr,"]\n"); /**********************************************************************************************/ size = (int*) malloc(sizeof(int)*(MAXALN + ARRAY_MARGIN)); dq = (int*) malloc(sizeof(int)*(MAXALN + ARRAY_MARGIN)); dt = (int*) malloc(sizeof(int)*(MAXALN + ARRAY_MARGIN)); if(size ==0 || dq ==0 || dt==0) { fprintf(stderr,"[ERROR: not enough memory for chains, exiting]\n"); exit(1); } /**********************************************************************************************/ chain_file = fopen(chain_file_name,"r"); if(chain_file==NULL) { fprintf(stderr,"[ERROR: cannot access %s, exiting]\n", chain_file_name); exit(1); } fseek(chain_file, 0, SEEK_END); unsigned int last_pos = ftell(chain_file); fseek(chain_file, 0, SEEK_SET); while(fgets(buff,MAXBUFFLENGTH,chain_file)) { if(strlen(buff)<2) break; buff[5]=0; if(strcmp(buff,"chain")==0) { sscanf(buff+6,"%li %s %i %c %i %i %s %i %c %i %i %li",&score, &chr1[0], &len1, &strand1, &start1, &end1, &chr2[0], &len2, &strand2, &start2, &end2, &chain_id); k=0; while(fgets(buff,MAXBUFFLENGTH,chain_file)) { if(strlen(buff)<2) break; progressbar(ftell(chain_file), last_pos-1, (char*)"Processing ", verbose); sscanf(buff,"%i %i %i",size + k, dt + k, dq + k); k++; if(k>=MAXALN) { fprintf(stderr,"[ERROR: chain too long, exiting]\n"); exit(1); } } x = get_chr_code(chr1); if(x<0) continue; a=start1;b=start2; j=0; for(i=chridx[x];i<chridx[x+1] && position[i]<start1;i++); for(;i<chridx[x+1]&& position[i]<end1;i++) { while(position[i]>a+size[j]+dt[j] && j<k){ a+=size[j]+dt[j]; b+=size[j]+dq[j]; j++; } if(j>=k) break; if(position[i]-a > marginlength && a+size[j]-position[i] >= marginlength) { if(strand1==strand2) { resstr = strand[i]; rescrd = position[i] - a + b; } else { resstr = -strand[i]; rescrd = len2 - (position[i] - a + b - 1) ; } fprintf(out_file,"%s\t%i\t%c\t%s\t%i\t%c\t%i\t%i\t%c\t%li\n",chr1, position[i], strand_i2c(strand[i]), chr2, rescrd, strand_i2c(resstr), idg[i], ids[i],type[i],score); } } } } fclose(chain_file); fclose(out_file); timestamp_report(); free(size); free(dq); free(dt); free(position); free(strand); exit(0); }