int main(int argc, char *argv[]) { if(argc!=6 && argc!=7 && argc!=8 && argc!=9) { printf("usage: %s sourceGeneSuffixTree reverseSourceGeneSuffixTree TargetGene minimumLength minimumRepetition [outputFile]\n", argv[0]); printf(" %s sourceGeneSuffixTree reverseSourceGeneSuffixTree TargetGeneSuffixTree reverseTargetGeneSuffixTree minimumLength minimumSourceRepetition minimumTargetRepetition [outputFile]\n", argv[0]); exit(EXIT_FAILURE); } check_target_repetitions=(argc==8 || argc==9); strncpy(st_fileName, argv[1], 49); st_fileName[49]='\0'; strncpy(rst_fileName, argv[2], 49); rst_fileName[49]='\0'; fileoutput=(argc==7 || argc==9); if(fileoutput) covering_pieces.clear(); if(check_target_repetitions) { strncpy(st2_fileName, argv[3], 49); st2_fileName[49]='\0'; strncpy(rst2_fileName, argv[4], 49); rst2_fileName[49]='\0'; l=atoi(argv[5]); minRep=atoi(argv[6]); minRep2=atoi(argv[7]); if(fileoutput) strncpy(output_fileName, argv[8], 99); } else { strncpy(target_fileName, argv[3], 49); target_fileName[49]='\0'; l=atoi(argv[4]); minRep=atoi(argv[5]); if(fileoutput) strncpy(output_fileName, argv[6], 99); } if(fileoutput) { output_fileName[99]='\0'; int filename_length=strlen(output_fileName); if(filename_length>90) { fprintf(stderr, "error: \"%s\" is too much long, use a prefix shorter 90 characters or less\n", output_fileName); exit(EXIT_FAILURE); } output_fileName[filename_length]='.'; output_fileName[filename_length+1]='l'; output_fileName[filename_length+2]='o'; output_fileName[filename_length+3]='g'; output_fileName[filename_length+4]='\0'; if((output_file=fopen(output_fileName, "w"))==NULL) { fprintf(stderr, "error: %s opening fail\n", output_fileName); exit(EXIT_FAILURE); } output_fileName[filename_length]='.'; output_fileName[filename_length+1]='p'; output_fileName[filename_length+2]='i'; output_fileName[filename_length+3]='e'; output_fileName[filename_length+4]='c'; output_fileName[filename_length+5]='e'; output_fileName[filename_length+6]='s'; output_fileName[filename_length+7]='\0'; if((output_file_pieces=fopen(output_fileName, "w"))==NULL) { fprintf(stderr, "error: %s opening fail\n", output_fileName); exit(EXIT_FAILURE); } } // intestation(); complement[C]=G; complement[G]=C; complement[A]=T; complement[T]=A; complement[X]=X; base2char[A]='A'; base2char[G]='G'; base2char[T]='T'; base2char[C]='C'; base2char[X]='X'; char2base['A']=A; char2base['G']=G; char2base['T']=T; char2base['C']=C; char2base['X']=X; // printf("Loading %s.\n", st_fileName); if((st_file=fopen(st_fileName, "r"))==NULL) { fprintf(stderr, "error: %s opening fail\n", st_fileName); exit(EXIT_FAILURE); } if(fread(&(stringLength), sizeof(int), 1, st_file)!=1) { fprintf(stderr, "failed fread of stringLength\n"); exit(EXIT_FAILURE); } s=new bpmatch_utils_base[stringLength]; if((int)fread(s, sizeof(bpmatch_utils_base), stringLength, st_file)!=stringLength) { fprintf(stderr, "failed fread of source string\n"); exit(EXIT_FAILURE); } st_root=new st_node; st_leaves=new st_node*[stringLength+2]; st_unserialize_node(st_file, st_root, st_leaves); int st_file_fd; if((st_file_fd=fileno(st_file))==-1) { fprintf(stderr, "failed getting %s file descriptor", st_fileName); exit(EXIT_FAILURE); } if(fsync(st_file_fd)!=0) { fprintf(stderr, "failed fsync of %s", st_fileName); exit(EXIT_FAILURE); } if(fclose(st_file)!=0) { fprintf(stderr, "failed fclose of %s", st_fileName); exit(EXIT_FAILURE); } // for(int i=0;i<stringLength;i++) printf("%c", base2char[s[i]]); // printf("\n"); // st_print(st_root); // printf("suffix tree unserialized.\n"); // printf("Loading %s.\n", rst_fileName); if((rst_file=fopen(rst_fileName, "r"))==NULL) { fprintf(stderr, "error: %s opening fail\n", rst_fileName); exit(EXIT_FAILURE); } if(fread(&(stringLength), sizeof(int), 1, rst_file)!=1) { fprintf(stderr, "failed fread of (reversed) stringLength\n"); exit(EXIT_FAILURE); } rs=new bpmatch_utils_base[stringLength]; if((int)fread(rs, sizeof(bpmatch_utils_base), stringLength, rst_file)!=stringLength) { fprintf(stderr, "failed fread of (reversed) source string\n"); exit(EXIT_FAILURE); } rst_root=new st_node; rst_leaves=new st_node*[stringLength+2]; st_unserialize_node(rst_file, rst_root, rst_leaves); int rst_file_fd; if((rst_file_fd=fileno(rst_file))==-1) { fprintf(stderr, "failed getting %s file descriptor", rst_fileName); exit(EXIT_FAILURE); } if(fsync(rst_file_fd)!=0) { fprintf(stderr, "failed fsync of %s", rst_fileName); exit(EXIT_FAILURE); } if(fclose(rst_file)!=0) { fprintf(stderr, "failed fclose of %s", rst_fileName); exit(EXIT_FAILURE); } // for(int i=0;i<stringLength;i++) printf("%c", base2char[rs[i]]); // printf("\n"); // st_print(rst_root); // printf("suffix tree unserialized.\n"); int trueStringLength; if(check_target_repetitions) { // printf("Loading %s.\n", st2_fileName); if((st2_file=fopen(st2_fileName, "r"))==NULL) { fprintf(stderr, "error: %s opening fail\n", st2_fileName); exit(EXIT_FAILURE); } if(fread(&(stringLength), sizeof(int), 1, st2_file)!=1) { fprintf(stderr, "failed fread of stringLength\n"); exit(EXIT_FAILURE); } t=new bpmatch_utils_base[stringLength]; if((int)fread(t, sizeof(bpmatch_utils_base), stringLength, st2_file)!=stringLength) { fprintf(stderr, "failed fread of source string\n"); exit(EXIT_FAILURE); } trueStringLength=stringLength; st2_root=new st_node; st2_leaves=new st_node*[stringLength+2]; st_unserialize_node(st2_file, st2_root, st2_leaves); int st2_file_fd; if((st2_file_fd=fileno(st2_file))==-1) { fprintf(stderr, "failed getting %s file descriptor", st2_fileName); exit(EXIT_FAILURE); } if(fsync(st2_file_fd)!=0) { fprintf(stderr, "failed fsync of %s", st2_fileName); exit(EXIT_FAILURE); } if(fclose(st2_file)!=0) { fprintf(stderr, "failed fclose of %s", st2_fileName); exit(EXIT_FAILURE); } // printf("Loading %s.\n", rst2_fileName); if((rst2_file=fopen(rst2_fileName, "r"))==NULL) { fprintf(stderr, "error: %s opening fail\n", rst2_fileName); exit(EXIT_FAILURE); } if(fread(&(stringLength), sizeof(int), 1, rst2_file)!=1) { fprintf(stderr, "failed fread of (reversed) stringLength\n"); exit(EXIT_FAILURE); } rt=new bpmatch_utils_base[stringLength]; if((int)fread(rt, sizeof(bpmatch_utils_base), stringLength, rst2_file)!=stringLength) { fprintf(stderr, "failed fread of (reversed) source string\n"); exit(EXIT_FAILURE); } rst2_root=new st_node; rst2_leaves=new st_node*[stringLength+2]; st_unserialize_node(rst2_file, rst2_root, rst2_leaves); int rst2_file_fd; if((rst2_file_fd=fileno(rst2_file))==-1) { fprintf(stderr, "failed getting %s file descriptor", rst2_fileName); exit(EXIT_FAILURE); } if(fsync(rst2_file_fd)!=0) { fprintf(stderr, "failed fsync of %s", rst2_fileName); exit(EXIT_FAILURE); } if(fclose(rst2_file)!=0) { fprintf(stderr, "failed fclose of %s", rst2_fileName); exit(EXIT_FAILURE); } } else { if((target_file=fopen(target_fileName, "r"))==NULL) { fprintf(stderr, "error: %s opening fail\n", target_fileName); exit(EXIT_FAILURE); } struct stat target_fileInfo; stat(target_fileName, &target_fileInfo); stringLength=target_fileInfo.st_size; t=new bpmatch_utils_base[stringLength+1]; trueStringLength=0; bpmatch_utils_base bp; scanBp_fasta=false; while(scanBp(target_file, &bp)!=EOF) t[trueStringLength++]=bp; t[trueStringLength++]=X; int target_file_fd; if((target_file_fd=fileno(target_file))==-1) { fprintf(stderr, "failed getting gene file descriptor"); exit(EXIT_FAILURE); } if(fsync(target_file_fd)!=0) { fprintf(stderr, "failed fsync of gene"); exit(EXIT_FAILURE); } if(fclose(target_file)!=0) { fprintf(stderr, "failed fclose of gene"); exit(EXIT_FAILURE); } } if(fileoutput) tmp_piece=new char[trueStringLength+1]; // printf("\n"); // printf("*************************************************\n"); // printf("matching - l=%d - minRep=%d\n", l, minRep); // printf("*************************************************\n"); bool end=false; bool found; int shiftTo=0; int maxPrefix, maxSuffix; int state=0; at=0; int i; int coverage=0; st_search* search_source_direct=new st_search; st_search* search_source_reverse=new st_search; st_search* search_target_direct=new st_search; st_search* search_target_reverse=new st_search; search_source_direct->root=st_root; search_source_direct->string=s; search_source_reverse->root=rst_root; search_source_reverse->string=rs; if(check_target_repetitions) { search_target_direct->root=st2_root; search_target_direct->string=t; search_target_reverse->root=rst2_root; search_target_reverse->string=rt; } while(!end) { found=false; if(state==0) { //case 0 if(DEBUG) printf("case 0\n"); //searching for direct seq i=l-1; //using reverse from actual+l-1 to actual (backward parsing) initialize_new_match(search_source_direct); initialize_new_match(search_source_reverse); if(check_target_repetitions) { initialize_new_match(search_target_direct); initialize_new_match(search_target_reverse); while(i>=0 && at+i<trueStringLength && composite_match_target_check(search_source_direct, search_source_reverse, search_target_direct, search_target_reverse, complement[t[at+i]])) i--; } else { while(i>=0 && at+i<trueStringLength && composite_match(search_source_direct, search_source_reverse, complement[t[at+i]])) i--; } if(i>=0) { //not found shiftTo=at+i+1; if(DEBUG) printf("not found [%d -> %d]\n", at, shiftTo); } else { //direct sequence found i=0; //using direct from actual (forward parsing) initialize_new_match(search_source_direct); initialize_new_match(search_source_reverse); if(check_target_repetitions) { initialize_new_match(search_target_direct); initialize_new_match(search_target_reverse); while(at+i<trueStringLength && composite_match_target_check(search_source_direct, search_source_reverse, search_target_direct, search_target_reverse, t[at+i])) i++; } else { while(at+i<trueStringLength && composite_match(search_source_direct, search_source_reverse, t[at+i])) i++; } found=true; shiftTo=at+i; if(DEBUG) printf("***************************************************found [%d -> %d]\n", at, shiftTo); } } else { //case 1 if(DEBUG) printf("case 1\n"); //searching for direct (eventually ovelapping) seq maxSuffix=0; maxPrefix=0; //using direct from actual (forward parsing) initialize_new_match(search_source_direct); initialize_new_match(search_source_reverse); if(check_target_repetitions) { initialize_new_match(search_target_direct); initialize_new_match(search_target_reverse); while(at+maxSuffix<trueStringLength && composite_match_target_check(search_source_direct, search_source_reverse, search_target_direct, search_target_reverse, t[at+maxSuffix])) maxSuffix++; } else { while(at+maxSuffix<trueStringLength && composite_match(search_source_direct, search_source_reverse, t[at+maxSuffix])) maxSuffix++; } if(maxSuffix>=l) { //direct sequence found found=true; shiftTo=at+maxSuffix; if(DEBUG) printf("1**************************************************found [%d -> %d]\n", at, shiftTo); } else { if(DEBUG) printf("compute maxPrefix\n"); //using reverse from actual (backward parsing) initialize_new_match(search_source_direct); initialize_new_match(search_source_reverse); if(check_target_repetitions) { initialize_new_match(search_target_direct); initialize_new_match(search_target_reverse); while(maxPrefix<l && at-maxPrefix>=0 && composite_match_target_check(search_source_direct, search_source_reverse, search_target_direct, search_target_reverse, complement[t[at-maxPrefix]])) maxPrefix++; } else { while(maxPrefix<l && at-maxPrefix>=0 && composite_match(search_source_direct, search_source_reverse, complement[t[at-maxPrefix]])) maxPrefix++; } maxPrefix--; while(!found && maxPrefix+maxSuffix>=l) { if(DEBUG) printf("*ite %d-%d\n", maxPrefix, maxSuffix); i=maxSuffix-l; //using direct from actual (forward parsing) initialize_new_match(search_source_direct); initialize_new_match(search_source_reverse); if(check_target_repetitions) { initialize_new_match(search_target_direct); initialize_new_match(search_target_reverse); while(at+i<trueStringLength && composite_match_target_check(search_source_direct, search_source_reverse, search_target_direct, search_target_reverse, t[at+i])) i++; } else { while(at+i<trueStringLength && composite_match(search_source_direct, search_source_reverse, t[at+i])) i++; } if(i==maxSuffix) { //direct sequence found found=true; shiftTo=at+maxSuffix; } else { if(DEBUG) printf("error at %d\n", i); maxSuffix=i; } } if(!found) { //not found shiftTo=at+1; } } } if(found) { if(fileoutput) { fprintf(output_file, "%i,", at); if(state==1) { fprintf(output_file, "*"); for(int j=shiftTo-l;j<shiftTo;j++) tmp_piece[j-shiftTo+l]=base2char[t[j]]; tmp_piece[l]='\0'; } else { for(int j=at;j<shiftTo;j++) tmp_piece[j-at]=base2char[t[j]]; tmp_piece[shiftTo-at]='\0'; } for(int j=at;j<shiftTo;j++) fprintf(output_file, "%c", base2char[t[j]]); if(check_target_repetitions) fprintf(output_file, ",direct-reverse:%d-%d,targetrepetitionsdirect-reverse:%d-%d\n", last_valid_direct_count, last_valid_reverse_count, last_valid_direct_count_target, last_valid_reverse_count_target); else fprintf(output_file, ",direct-reverse:%d-%d\n", last_valid_direct_count, last_valid_reverse_count); covering_pieces.insert(std::string(tmp_piece)); } coverage+=shiftTo-at; state=1; } else state=0; at=shiftTo; if(at>=trueStringLength) end=true; } // printf("\n\ncoverage: %d/%d\n", coverage, trueStringLength-1); printf("%f\n", (double(coverage))/((double)(trueStringLength-1))); if(fileoutput) { fprintf(output_file, "\ncoverage: %d/%d\n", coverage, trueStringLength-1); fprintf(output_file, "%f\n", (double(coverage))/((double)(trueStringLength-1))); if(fclose(output_file)!=0) { fprintf(stderr, "failed fclose of output"); exit(EXIT_FAILURE); } int search_index; for(covering_pieces_ite=covering_pieces.begin();covering_pieces_ite!=covering_pieces.end();covering_pieces_ite++) { fprintf(output_file_pieces, "SEGMENT: %s\n", covering_pieces_ite->data()); fprintf(output_file_pieces, "sources direct occurrences by starting index "); initialize_new_match(search_source_direct); search_index=0; while((*covering_pieces_ite)[search_index]!='\0') search_source_direct->current_count=single_match(search_source_direct, char2base[(int)(*covering_pieces_ite)[search_index++]]); fprintf(output_file_pieces, "(%d) :", search_source_direct->current_count); if(search_source_direct->current_count>0) { if(search_source_direct->current_node->leafId==0) { //internal node for(int i=search_source_direct->current_node->firstLeaf;i<=search_source_direct->current_node->lastLeaf;i++) { fprintf(output_file_pieces, " %d", st_root->lastLeaf-st_leaves[i]->length); } } else { //leaf fprintf(output_file_pieces, " %d", st_root->lastLeaf-search_source_direct->current_node->length); } } fprintf(output_file_pieces, "\n"); fprintf(output_file_pieces, "sources reverse occurrences by starting index "); initialize_new_match(search_source_reverse); search_index=0; while((*covering_pieces_ite)[search_index]!='\0') search_source_reverse->current_count=single_match(search_source_reverse, char2base[(int)(*covering_pieces_ite)[search_index++]]); fprintf(output_file_pieces, "(%d) :", search_source_reverse->current_count); if(search_source_reverse->current_count>0) { if(search_source_reverse->current_node->leafId==0) { //internal node for(int i=search_source_reverse->current_node->firstLeaf;i<=search_source_reverse->current_node->lastLeaf;i++) { fprintf(output_file_pieces, " %d", st_root->lastLeaf-st_leaves[i]->length); } } else { //leaf fprintf(output_file_pieces, " %d", st_root->lastLeaf-search_source_reverse->current_node->length); } } fprintf(output_file_pieces, "\n"); if(check_target_repetitions) { fprintf(output_file_pieces, "target direct occurrences by starting index "); initialize_new_match(search_target_direct); search_index=0; while((*covering_pieces_ite)[search_index]!='\0') search_target_direct->current_count=single_match(search_target_direct, char2base[(int)(*covering_pieces_ite)[search_index++]]); fprintf(output_file_pieces, "(%d) :", search_target_direct->current_count); if(search_target_direct->current_count>0) { if(search_target_direct->current_node->leafId==0) { //internal node for(int i=search_target_direct->current_node->firstLeaf;i<=search_target_direct->current_node->lastLeaf;i++) { fprintf(output_file_pieces, " %d", st2_root->lastLeaf-st_leaves[i]->length); } } else { //leaf fprintf(output_file_pieces, " %d", st2_root->lastLeaf-search_target_direct->current_node->length); } } fprintf(output_file_pieces, "\n"); fprintf(output_file_pieces, "target reverse occurrences by starting index "); initialize_new_match(search_target_reverse); search_index=0; while((*covering_pieces_ite)[search_index]!='\0') search_target_reverse->current_count=single_match(search_target_reverse, char2base[(int)(*covering_pieces_ite)[search_index++]]); fprintf(output_file_pieces, "(%d) :", search_target_reverse->current_count); if(search_target_reverse->current_count>0) { if(search_target_reverse->current_node->leafId==0) { //internal node for(int i=search_target_reverse->current_node->firstLeaf;i<=search_target_reverse->current_node->lastLeaf;i++) { fprintf(output_file_pieces, " %d", st2_root->lastLeaf-st_leaves[i]->length); } } else { //leaf fprintf(output_file_pieces, " %d", st2_root->lastLeaf-search_target_reverse->current_node->length); } } fprintf(output_file_pieces, "\n"); } fprintf(output_file_pieces, "\n"); } if(fclose(output_file_pieces)!=0) { fprintf(stderr, "failed fclose of output"); exit(EXIT_FAILURE); } delete[] tmp_piece; } // printf("\ncoverage: %d/%d\n", coverage, trueStringLength, subSeqId); // printf("%f (%d sequences used)\n\n", double(coverage)/double(trueStringLength-1), subSeqId); /// printf("%f", double(coverage)/double(trueStringLength-1)); delete[] st_leaves; st_free_node(st_root); delete[] rst_leaves; st_free_node(rst_root); delete[] s; delete[] rs; if(check_target_repetitions) { delete[] st2_leaves; st_free_node(st2_root); delete[] rst2_leaves; st_free_node(rst2_root); delete[] rt; } delete[] t; // printf("termination reached without errors\n"); exit(EXIT_SUCCESS); }