Пример #1
0
int main(int argc, char *argv[]) {
	if(argc!=6 && argc!=7 && argc!=8 && argc!=9) {
		printf("usage: %s sourceGeneSuffixTree reverseSourceGeneSuffixTree TargetGene minimumLength minimumRepetition [outputFile]\n", argv[0]);
		printf("       %s sourceGeneSuffixTree reverseSourceGeneSuffixTree TargetGeneSuffixTree reverseTargetGeneSuffixTree minimumLength minimumSourceRepetition minimumTargetRepetition [outputFile]\n", argv[0]);
		exit(EXIT_FAILURE);
		}

	check_target_repetitions=(argc==8 || argc==9);

	strncpy(st_fileName, argv[1], 49);
	st_fileName[49]='\0';
	strncpy(rst_fileName, argv[2], 49);
	rst_fileName[49]='\0';

	fileoutput=(argc==7 || argc==9);
	if(fileoutput) covering_pieces.clear();

	if(check_target_repetitions) {
		strncpy(st2_fileName, argv[3], 49);
		st2_fileName[49]='\0';
		strncpy(rst2_fileName, argv[4], 49);
		rst2_fileName[49]='\0';

		l=atoi(argv[5]);
		minRep=atoi(argv[6]);
		minRep2=atoi(argv[7]);

		if(fileoutput) strncpy(output_fileName, argv[8], 99);
		}
	else {
		strncpy(target_fileName, argv[3], 49);
		target_fileName[49]='\0';

		l=atoi(argv[4]);
		minRep=atoi(argv[5]);

		if(fileoutput) strncpy(output_fileName, argv[6], 99);
		}

	if(fileoutput) {
		output_fileName[99]='\0';
		int filename_length=strlen(output_fileName);
		if(filename_length>90) {
			fprintf(stderr, "error: \"%s\" is too much long, use a prefix shorter 90 characters or less\n", output_fileName);
			exit(EXIT_FAILURE);
			}

		output_fileName[filename_length]='.';
		output_fileName[filename_length+1]='l';
		output_fileName[filename_length+2]='o';
		output_fileName[filename_length+3]='g';
		output_fileName[filename_length+4]='\0';
		if((output_file=fopen(output_fileName, "w"))==NULL) {
			fprintf(stderr, "error: %s opening fail\n", output_fileName);
			exit(EXIT_FAILURE);
			}

		output_fileName[filename_length]='.';
		output_fileName[filename_length+1]='p';
		output_fileName[filename_length+2]='i';
		output_fileName[filename_length+3]='e';
		output_fileName[filename_length+4]='c';
		output_fileName[filename_length+5]='e';
		output_fileName[filename_length+6]='s';
		output_fileName[filename_length+7]='\0';
		if((output_file_pieces=fopen(output_fileName, "w"))==NULL) {
			fprintf(stderr, "error: %s opening fail\n", output_fileName);
			exit(EXIT_FAILURE);
			}
		}

//	intestation();
	complement[C]=G;
	complement[G]=C;
	complement[A]=T;
	complement[T]=A;
	complement[X]=X;
	base2char[A]='A';
	base2char[G]='G';
	base2char[T]='T';
	base2char[C]='C';
	base2char[X]='X';
	char2base['A']=A;
	char2base['G']=G;
	char2base['T']=T;
	char2base['C']=C;
	char2base['X']=X;
//	printf("Loading %s.\n", st_fileName);
	if((st_file=fopen(st_fileName, "r"))==NULL) {
		fprintf(stderr, "error: %s opening fail\n", st_fileName);
		exit(EXIT_FAILURE);
		}
	if(fread(&(stringLength), sizeof(int), 1, st_file)!=1) {
		fprintf(stderr, "failed fread of stringLength\n");
		exit(EXIT_FAILURE);
		}
	s=new bpmatch_utils_base[stringLength];
	if((int)fread(s, sizeof(bpmatch_utils_base), stringLength, st_file)!=stringLength) {
		fprintf(stderr, "failed fread of source string\n");
		exit(EXIT_FAILURE);
		}
	st_root=new st_node;
	st_leaves=new st_node*[stringLength+2];
	st_unserialize_node(st_file, st_root, st_leaves);
	int st_file_fd;
	if((st_file_fd=fileno(st_file))==-1) {
		fprintf(stderr, "failed getting %s file descriptor", st_fileName);
		exit(EXIT_FAILURE);
		}
	if(fsync(st_file_fd)!=0) {
		fprintf(stderr, "failed fsync of %s", st_fileName);
		exit(EXIT_FAILURE);
		}
	if(fclose(st_file)!=0) {
		fprintf(stderr, "failed fclose of %s", st_fileName);
		exit(EXIT_FAILURE);
		}
//	for(int i=0;i<stringLength;i++) printf("%c", base2char[s[i]]);
//	printf("\n");
//	st_print(st_root);
//	printf("suffix tree unserialized.\n");
//	printf("Loading %s.\n", rst_fileName);
	if((rst_file=fopen(rst_fileName, "r"))==NULL) {
		fprintf(stderr, "error: %s opening fail\n", rst_fileName);
		exit(EXIT_FAILURE);
		}
	if(fread(&(stringLength), sizeof(int), 1, rst_file)!=1) {
		fprintf(stderr, "failed fread of (reversed) stringLength\n");
		exit(EXIT_FAILURE);
		}
	rs=new bpmatch_utils_base[stringLength];
	if((int)fread(rs, sizeof(bpmatch_utils_base), stringLength, rst_file)!=stringLength) {
		fprintf(stderr, "failed fread of (reversed) source string\n");
		exit(EXIT_FAILURE);
		}
	rst_root=new st_node;
	rst_leaves=new st_node*[stringLength+2];
	st_unserialize_node(rst_file, rst_root, rst_leaves);
	int rst_file_fd;
	if((rst_file_fd=fileno(rst_file))==-1) {
		fprintf(stderr, "failed getting %s file descriptor", rst_fileName);
		exit(EXIT_FAILURE);
		}
	if(fsync(rst_file_fd)!=0) {
		fprintf(stderr, "failed fsync of %s", rst_fileName);
		exit(EXIT_FAILURE);
		}
	if(fclose(rst_file)!=0) {
		fprintf(stderr, "failed fclose of %s", rst_fileName);
		exit(EXIT_FAILURE);
		}
//	for(int i=0;i<stringLength;i++) printf("%c", base2char[rs[i]]);
//	printf("\n");
//	st_print(rst_root);
//	printf("suffix tree unserialized.\n");

	int trueStringLength;
	if(check_target_repetitions) {
//		printf("Loading %s.\n", st2_fileName);
		if((st2_file=fopen(st2_fileName, "r"))==NULL) {
			fprintf(stderr, "error: %s opening fail\n", st2_fileName);
			exit(EXIT_FAILURE);
			}
		if(fread(&(stringLength), sizeof(int), 1, st2_file)!=1) {
			fprintf(stderr, "failed fread of stringLength\n");
			exit(EXIT_FAILURE);
			}
		t=new bpmatch_utils_base[stringLength];
		if((int)fread(t, sizeof(bpmatch_utils_base), stringLength, st2_file)!=stringLength) {
			fprintf(stderr, "failed fread of source string\n");
			exit(EXIT_FAILURE);
			}
		trueStringLength=stringLength;
		st2_root=new st_node;
		st2_leaves=new st_node*[stringLength+2];
		st_unserialize_node(st2_file, st2_root, st2_leaves);
		int st2_file_fd;
		if((st2_file_fd=fileno(st2_file))==-1) {
			fprintf(stderr, "failed getting %s file descriptor", st2_fileName);
			exit(EXIT_FAILURE);
			}
		if(fsync(st2_file_fd)!=0) {
			fprintf(stderr, "failed fsync of %s", st2_fileName);
			exit(EXIT_FAILURE);
			}
		if(fclose(st2_file)!=0) {
			fprintf(stderr, "failed fclose of %s", st2_fileName);
			exit(EXIT_FAILURE);
			}
//		printf("Loading %s.\n", rst2_fileName);
		if((rst2_file=fopen(rst2_fileName, "r"))==NULL) {
			fprintf(stderr, "error: %s opening fail\n", rst2_fileName);
			exit(EXIT_FAILURE);
			}
		if(fread(&(stringLength), sizeof(int), 1, rst2_file)!=1) {
			fprintf(stderr, "failed fread of (reversed) stringLength\n");
			exit(EXIT_FAILURE);
			}
		rt=new bpmatch_utils_base[stringLength];
		if((int)fread(rt, sizeof(bpmatch_utils_base), stringLength, rst2_file)!=stringLength) {
			fprintf(stderr, "failed fread of (reversed) source string\n");
			exit(EXIT_FAILURE);
			}
		rst2_root=new st_node;
		rst2_leaves=new st_node*[stringLength+2];
		st_unserialize_node(rst2_file, rst2_root, rst2_leaves);
		int rst2_file_fd;
		if((rst2_file_fd=fileno(rst2_file))==-1) {
			fprintf(stderr, "failed getting %s file descriptor", rst2_fileName);
			exit(EXIT_FAILURE);
			}
		if(fsync(rst2_file_fd)!=0) {
			fprintf(stderr, "failed fsync of %s", rst2_fileName);
			exit(EXIT_FAILURE);
			}
		if(fclose(rst2_file)!=0) {
			fprintf(stderr, "failed fclose of %s", rst2_fileName);
			exit(EXIT_FAILURE);
			}
		}
	else {
		if((target_file=fopen(target_fileName, "r"))==NULL) {
			fprintf(stderr, "error: %s opening fail\n", target_fileName);
			exit(EXIT_FAILURE);
			}
		struct stat target_fileInfo;
		stat(target_fileName, &target_fileInfo);
		stringLength=target_fileInfo.st_size;
		t=new bpmatch_utils_base[stringLength+1];
		trueStringLength=0;
		bpmatch_utils_base bp;
		scanBp_fasta=false;
		while(scanBp(target_file, &bp)!=EOF) t[trueStringLength++]=bp;
		t[trueStringLength++]=X;
		int target_file_fd;
		if((target_file_fd=fileno(target_file))==-1) {
			fprintf(stderr, "failed getting gene file descriptor");
			exit(EXIT_FAILURE);
			}
		if(fsync(target_file_fd)!=0) {
			fprintf(stderr, "failed fsync of gene");
			exit(EXIT_FAILURE);
			}
		if(fclose(target_file)!=0) {
			fprintf(stderr, "failed fclose of gene");
			exit(EXIT_FAILURE);
			}
		}

	if(fileoutput) tmp_piece=new char[trueStringLength+1];

//	printf("\n");
//	printf("*************************************************\n");
//	printf("matching - l=%d - minRep=%d\n", l, minRep);
//	printf("*************************************************\n");
	bool end=false;
	bool found;
	int shiftTo=0;
	int maxPrefix, maxSuffix;
	int state=0;
	at=0;
	int i;
	int coverage=0;
	st_search* search_source_direct=new st_search;
	st_search* search_source_reverse=new st_search;
	st_search* search_target_direct=new st_search;
	st_search* search_target_reverse=new st_search;
	search_source_direct->root=st_root;
	search_source_direct->string=s;
	search_source_reverse->root=rst_root;
	search_source_reverse->string=rs;
	if(check_target_repetitions) {
		search_target_direct->root=st2_root;
		search_target_direct->string=t;
		search_target_reverse->root=rst2_root;
		search_target_reverse->string=rt;
		}
	while(!end) {
		found=false;
		if(state==0) {
			//case 0
			if(DEBUG) printf("case 0\n");
			//searching for direct seq
			i=l-1;
			//using reverse from actual+l-1 to actual (backward parsing)
			initialize_new_match(search_source_direct);
			initialize_new_match(search_source_reverse);
			if(check_target_repetitions) {
				initialize_new_match(search_target_direct);
				initialize_new_match(search_target_reverse);
				while(i>=0 && at+i<trueStringLength && composite_match_target_check(search_source_direct, search_source_reverse, search_target_direct, search_target_reverse, complement[t[at+i]])) i--;
				}
			else {
				while(i>=0 && at+i<trueStringLength && composite_match(search_source_direct, search_source_reverse, complement[t[at+i]])) i--;
				}
			if(i>=0) {
				//not found
				shiftTo=at+i+1;
				if(DEBUG) printf("not found [%d -> %d]\n", at, shiftTo);
				}
			else {
				//direct sequence found
				i=0;
				//using direct from actual (forward parsing)
				initialize_new_match(search_source_direct);
				initialize_new_match(search_source_reverse);
				if(check_target_repetitions) {
					initialize_new_match(search_target_direct);
					initialize_new_match(search_target_reverse);
					while(at+i<trueStringLength && composite_match_target_check(search_source_direct, search_source_reverse, search_target_direct, search_target_reverse, t[at+i])) i++;
					}
				else {
					while(at+i<trueStringLength && composite_match(search_source_direct, search_source_reverse, t[at+i])) i++;
					}
				found=true;
				shiftTo=at+i;
				if(DEBUG) printf("***************************************************found [%d -> %d]\n", at, shiftTo);
				}
			}
		else {
			//case 1
			if(DEBUG) printf("case 1\n");
			//searching for direct (eventually ovelapping) seq
			maxSuffix=0;
			maxPrefix=0;
			//using direct from actual (forward parsing)
			initialize_new_match(search_source_direct);
			initialize_new_match(search_source_reverse);
			if(check_target_repetitions) {
				initialize_new_match(search_target_direct);
				initialize_new_match(search_target_reverse);
				while(at+maxSuffix<trueStringLength && composite_match_target_check(search_source_direct, search_source_reverse, search_target_direct, search_target_reverse, t[at+maxSuffix])) maxSuffix++;
				}
			else {
				while(at+maxSuffix<trueStringLength && composite_match(search_source_direct, search_source_reverse, t[at+maxSuffix])) maxSuffix++;
				}
			if(maxSuffix>=l) {
				//direct sequence found
				found=true;
				shiftTo=at+maxSuffix;
				if(DEBUG) printf("1**************************************************found [%d -> %d]\n", at, shiftTo);
				}
			else {
				if(DEBUG) printf("compute maxPrefix\n");
				//using reverse from actual (backward parsing)
				initialize_new_match(search_source_direct);
				initialize_new_match(search_source_reverse);
				if(check_target_repetitions) {
					initialize_new_match(search_target_direct);
					initialize_new_match(search_target_reverse);
					while(maxPrefix<l && at-maxPrefix>=0 && composite_match_target_check(search_source_direct, search_source_reverse, search_target_direct, search_target_reverse, complement[t[at-maxPrefix]])) maxPrefix++;
					}
				else {
					while(maxPrefix<l && at-maxPrefix>=0 && composite_match(search_source_direct, search_source_reverse, complement[t[at-maxPrefix]])) maxPrefix++;
					}
				maxPrefix--;
				while(!found && maxPrefix+maxSuffix>=l) {
					if(DEBUG) printf("*ite %d-%d\n", maxPrefix, maxSuffix);
					i=maxSuffix-l;
					//using direct from actual (forward parsing)
					initialize_new_match(search_source_direct);
					initialize_new_match(search_source_reverse);
					if(check_target_repetitions) {
						initialize_new_match(search_target_direct);
						initialize_new_match(search_target_reverse);
						while(at+i<trueStringLength && composite_match_target_check(search_source_direct, search_source_reverse, search_target_direct, search_target_reverse, t[at+i])) i++;
						}
					else {
						while(at+i<trueStringLength && composite_match(search_source_direct, search_source_reverse, t[at+i])) i++;
						}
					if(i==maxSuffix) {
						//direct sequence found
						found=true;
						shiftTo=at+maxSuffix;
						}
					else {
						if(DEBUG) printf("error at %d\n", i);
						maxSuffix=i;
						}
					}
				if(!found) {
					//not found
					shiftTo=at+1;
					}
				}
			}

		if(found) {
			if(fileoutput) {
				fprintf(output_file, "%i,", at);
				if(state==1) {
					fprintf(output_file, "*");
					for(int j=shiftTo-l;j<shiftTo;j++) tmp_piece[j-shiftTo+l]=base2char[t[j]];
					tmp_piece[l]='\0';
					}
				else {
					for(int j=at;j<shiftTo;j++) tmp_piece[j-at]=base2char[t[j]];
					tmp_piece[shiftTo-at]='\0';
					}
				for(int j=at;j<shiftTo;j++) fprintf(output_file, "%c", base2char[t[j]]);
				if(check_target_repetitions) fprintf(output_file, ",direct-reverse:%d-%d,targetrepetitionsdirect-reverse:%d-%d\n", last_valid_direct_count, last_valid_reverse_count, last_valid_direct_count_target, last_valid_reverse_count_target);
				else fprintf(output_file, ",direct-reverse:%d-%d\n", last_valid_direct_count, last_valid_reverse_count);
				covering_pieces.insert(std::string(tmp_piece));
				}
			coverage+=shiftTo-at;
			state=1;
			}
		else state=0;

		at=shiftTo;
		if(at>=trueStringLength) end=true;
		}

//	printf("\n\ncoverage: %d/%d\n", coverage, trueStringLength-1);
	printf("%f\n", (double(coverage))/((double)(trueStringLength-1)));

	if(fileoutput) {
		fprintf(output_file, "\ncoverage: %d/%d\n", coverage, trueStringLength-1);
		fprintf(output_file, "%f\n", (double(coverage))/((double)(trueStringLength-1)));
		if(fclose(output_file)!=0) {
			fprintf(stderr, "failed fclose of output");
			exit(EXIT_FAILURE);
			}

		int search_index;
		for(covering_pieces_ite=covering_pieces.begin();covering_pieces_ite!=covering_pieces.end();covering_pieces_ite++) {
			fprintf(output_file_pieces, "SEGMENT: %s\n", covering_pieces_ite->data());

			fprintf(output_file_pieces, "sources direct occurrences by starting index ");
			initialize_new_match(search_source_direct);
			search_index=0;
			while((*covering_pieces_ite)[search_index]!='\0') search_source_direct->current_count=single_match(search_source_direct, char2base[(int)(*covering_pieces_ite)[search_index++]]);
			fprintf(output_file_pieces, "(%d) :", search_source_direct->current_count);
			if(search_source_direct->current_count>0) {
				if(search_source_direct->current_node->leafId==0) {
					//internal node
					for(int i=search_source_direct->current_node->firstLeaf;i<=search_source_direct->current_node->lastLeaf;i++) {
						fprintf(output_file_pieces, " %d", st_root->lastLeaf-st_leaves[i]->length);
						}
					}
				else {
					//leaf
					fprintf(output_file_pieces, " %d", st_root->lastLeaf-search_source_direct->current_node->length);
					}
				}
			fprintf(output_file_pieces, "\n");

			fprintf(output_file_pieces, "sources reverse occurrences by starting index ");
			initialize_new_match(search_source_reverse);
			search_index=0;
			while((*covering_pieces_ite)[search_index]!='\0') search_source_reverse->current_count=single_match(search_source_reverse, char2base[(int)(*covering_pieces_ite)[search_index++]]);
			fprintf(output_file_pieces, "(%d) :", search_source_reverse->current_count);
			if(search_source_reverse->current_count>0) {
				if(search_source_reverse->current_node->leafId==0) {
					//internal node
					for(int i=search_source_reverse->current_node->firstLeaf;i<=search_source_reverse->current_node->lastLeaf;i++) {
						fprintf(output_file_pieces, " %d", st_root->lastLeaf-st_leaves[i]->length);
						}
					}
				else {
					//leaf
					fprintf(output_file_pieces, " %d", st_root->lastLeaf-search_source_reverse->current_node->length);
					}
				}
			fprintf(output_file_pieces, "\n");

			if(check_target_repetitions) {
				fprintf(output_file_pieces, "target direct occurrences by starting index ");
				initialize_new_match(search_target_direct);
				search_index=0;
				while((*covering_pieces_ite)[search_index]!='\0') search_target_direct->current_count=single_match(search_target_direct, char2base[(int)(*covering_pieces_ite)[search_index++]]);
				fprintf(output_file_pieces, "(%d) :", search_target_direct->current_count);
				if(search_target_direct->current_count>0) {
					if(search_target_direct->current_node->leafId==0) {
						//internal node
						for(int i=search_target_direct->current_node->firstLeaf;i<=search_target_direct->current_node->lastLeaf;i++) {
							fprintf(output_file_pieces, " %d", st2_root->lastLeaf-st_leaves[i]->length);
							}
						}
					else {
						//leaf
						fprintf(output_file_pieces, " %d", st2_root->lastLeaf-search_target_direct->current_node->length);
						}
					}
				fprintf(output_file_pieces, "\n");

				fprintf(output_file_pieces, "target reverse occurrences by starting index ");
				initialize_new_match(search_target_reverse);
				search_index=0;
				while((*covering_pieces_ite)[search_index]!='\0') search_target_reverse->current_count=single_match(search_target_reverse, char2base[(int)(*covering_pieces_ite)[search_index++]]);
				fprintf(output_file_pieces, "(%d) :", search_target_reverse->current_count);
				if(search_target_reverse->current_count>0) {
					if(search_target_reverse->current_node->leafId==0) {
						//internal node
						for(int i=search_target_reverse->current_node->firstLeaf;i<=search_target_reverse->current_node->lastLeaf;i++) {
							fprintf(output_file_pieces, " %d", st2_root->lastLeaf-st_leaves[i]->length);
							}
						}
					else {
						//leaf
						fprintf(output_file_pieces, " %d", st2_root->lastLeaf-search_target_reverse->current_node->length);
						}
					}
				fprintf(output_file_pieces, "\n");
				}

			fprintf(output_file_pieces, "\n");
			}

		if(fclose(output_file_pieces)!=0) {
			fprintf(stderr, "failed fclose of output");
			exit(EXIT_FAILURE);
			}

		delete[] tmp_piece;
		}

//	printf("\ncoverage: %d/%d\n", coverage, trueStringLength, subSeqId);
//	printf("%f (%d sequences used)\n\n", double(coverage)/double(trueStringLength-1), subSeqId);
///	printf("%f", double(coverage)/double(trueStringLength-1));
	delete[] st_leaves;
	st_free_node(st_root);
	delete[] rst_leaves;
	st_free_node(rst_root);
	delete[] s;
	delete[] rs;
	if(check_target_repetitions) {
		delete[] st2_leaves;
		st_free_node(st2_root);
		delete[] rst2_leaves;
		st_free_node(rst2_root);
		delete[] rt;
		}
	delete[] t;

//	printf("termination reached without errors\n");
	exit(EXIT_SUCCESS);
	}