Exemple #1
0
bool Aligner::Align(const char* query, const Filter& filter,
                    Alignment* alignment) const
{
  if (!translation_matrix_) return false;
  if (reference_length_ == 0) return false;

  int query_len = strlen(query);
  if (query_len == 0) return false;
  int8_t* translated_query = new int8_t[query_len];
  TranslateBase(query, query_len, translated_query);

  const int8_t score_size = 2;
  s_profile* profile = ssw_init(translated_query, query_len, score_matrix_,
                                score_matrix_size_, score_size);

  uint8_t flag = 0;
  SetFlag(filter, &flag);
  s_align* s_al = ssw_align(profile, translated_reference_, reference_length_,
                                 static_cast<int>(gap_opening_penalty_),
				 static_cast<int>(gap_extending_penalty_),
				 flag, filter.score_filter, filter.distance_filter, query_len);

  alignment->Clear();
  ConvertAlignment(*s_al, query_len, alignment);
  alignment->mismatches = CalculateNumberMismatch(&*alignment, translated_reference_, translated_query, query_len);


  // Free memory
  delete [] translated_query;
  align_destroy(s_al);
  init_destroy(profile);

  return true;
}
int main (int argc, char * const argv[]) {
	clock_t start, end;
	float cpu_time;
	gzFile read_fp, ref_fp;
	kseq_t *read_seq, *ref_seq;
	int32_t l, m, k, match = 2, mismatch = 2, gap_open = 3, gap_extension = 1, path = 0, reverse = 0, n = 5, sam = 0, protein = 0, header = 0, s1 = 67108864, s2 = 128, filter = 0;
	int8_t* mata = (int8_t*)calloc(25, sizeof(int8_t));
	const int8_t* mat = mata;
	char mat_name[16];
	mat_name[0] = '\0';
	int8_t* ref_num = (int8_t*)malloc(s1);
	int8_t* num = (int8_t*)malloc(s2), *num_rc = 0;
	char* read_rc = 0;

	static const int8_t mat50[] = {
	//  A   R   N   D   C   Q   E   G   H   I   L   K   M   F   P   S   T   W   Y   V   B   Z   X   *
     	5, -2, -1, -2, -1, -1, -1,  0, -2, -1, -2, -1, -1, -3, -1,  1,  0, -3, -2,  0, -2, -1, -1, -5,	// A
       -2,  7, -1, -2, -4,  1,  0, -3,  0, -4, -3,  3, -2, -3, -3, -1, -1, -3, -1, -3, -1,  0, -1, -5,	// R
       -1, -1,  7,  2, -2,  0,  0,  0,  1, -3, -4,  0, -2, -4, -2,  1,  0, -4, -2, -3,  5,  0, -1, -5,	// N
       -2, -2,  2,  8, -4,  0,  2, -1, -1, -4, -4, -1, -4, -5, -1,  0, -1, -5, -3, -4,  6,  1, -1, -5,	// D
       -1, -4, -2, -4, 13, -3, -3, -3, -3, -2, -2, -3, -2, -2, -4, -1, -1, -5, -3, -1, -3, -3, -1, -5,	// C
       -1,  1,  0,  0, -3,  7,  2, -2,  1, -3, -2,  2,  0, -4, -1,  0, -1, -1, -1, -3,  0,  4, -1, -5,	// Q
       -1,  0,  0,  2, -3,  2,  6, -3,  0, -4, -3,  1, -2, -3, -1, -1, -1, -3, -2, -3,  1,  5, -1, -5,	// E
     	0, -3,  0, -1, -3, -2, -3,  8, -2, -4, -4, -2, -3, -4, -2,  0, -2, -3, -3, -4, -1, -2, -1, -5,	// G
       -2,  0,  1, -1, -3,  1,  0, -2, 10, -4, -3,  0, -1, -1, -2, -1, -2, -3,  2, -4,  0,  0, -1, -5,	// H
       -1, -4, -3, -4, -2, -3, -4, -4, -4,  5,  2, -3,  2,  0, -3, -3, -1, -3, -1,  4, -4, -3, -1, -5,	// I
       -2, -3, -4, -4, -2, -2, -3, -4, -3,  2,  5, -3,  3,  1, -4, -3, -1, -2, -1,  1, -4, -3, -1, -5,	// L
       -1,  3,  0, -1, -3,  2,  1, -2,  0, -3, -3,  6, -2, -4, -1,  0, -1, -3, -2, -3,  0,  1, -1, -5,	// K
       -1, -2, -2, -4, -2,  0, -2, -3, -1,  2,  3, -2,  7,  0, -3, -2, -1, -1,  0,  1, -3, -1, -1, -5,	// M
       -3, -3, -4, -5, -2, -4, -3, -4, -1,  0,  1, -4,  0,  8, -4, -3, -2,  1,  4, -1, -4, -4, -1, -5,	// F
       -1, -3, -2, -1, -4, -1, -1, -2, -2, -3, -4, -1, -3, -4, 10, -1, -1, -4, -3, -3, -2, -1, -1, -5,	// P
     	1, -1,  1,  0, -1,  0, -1,  0, -1, -3, -3,  0, -2, -3, -1,  5,  2, -4, -2, -2,  0,  0, -1, -5,	// S
    	0, -1,  0, -1, -1, -1, -1, -2, -2, -1, -1, -1, -1, -2, -1,  2,  5, -3, -2,  0,  0, -1, -1, -5, 	// T
       -3, -3, -4, -5, -5, -1, -3, -3, -3, -3, -2, -3, -1,  1, -4, -4, -3, 15,  2, -3, -5, -2, -1, -5, 	// W
       -2, -1, -2, -3, -3, -1, -2, -3,  2, -1, -1, -2,  0,  4, -3, -2, -2,  2,  8, -1, -3, -2, -1, -5, 	// Y
     	0, -3, -3, -4, -1, -3, -3, -4, -4,  4,  1, -3,  1, -1, -3, -2,  0, -3, -1,  5, -3, -3, -1, -5, 	// V
       -2, -1,  5,  6, -3,  0,  1, -1,  0, -4, -4,  0, -3, -4, -2,  0,  0, -5, -3, -3,  6,  1, -1, -5, 	// B
       -1,  0,  0,  1, -3,  4,  5, -2,  0, -3, -3,  1, -1, -4, -1,  0, -1, -2, -2, -3,  1,  5, -1, -5, 	// Z
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -5, 	// X
       -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5,  1 	// *
	};

	/* This table is used to transform amino acid letters into numbers. */
	int8_t aa_table[128] = {
		23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23,
		23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23,
		23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23,
		23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23,
		23, 0,  20, 4,  3,  6,  13, 7,  8,  9,  23, 11, 10, 12, 2,  23,
		14, 5,  1,  15, 16, 23, 19, 17, 22, 18, 21, 23, 23, 23, 23, 23,
		23, 0,  20, 4,  3,  6,  13, 7,  8,  9,  23, 11, 10, 12, 2,  23,
		14, 5,  1,  15, 16, 23, 19, 17, 22, 18, 21, 23, 23, 23, 23, 23
	};

	/* This table is used to transform nucleotide letters into numbers. */
	int8_t nt_table[128] = {
		4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
		4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
		4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
		4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
		4, 0, 4, 1,  4, 4, 4, 2,  4, 4, 4, 4,  4, 4, 4, 4,
		4, 4, 4, 4,  3, 3, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
		4, 0, 4, 1,  4, 4, 4, 2,  4, 4, 4, 4,  4, 4, 4, 4,
		4, 4, 4, 4,  3, 3, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4
	};

	int8_t* table = nt_table;

	// Parse command line.
	while ((l = getopt(argc, argv, "m:x:o:e:a:f:pcrsh")) >= 0) {
		switch (l) {
			case 'm': match = atoi(optarg); break;
			case 'x': mismatch = atoi(optarg); break;
			case 'o': gap_open = atoi(optarg); break;
			case 'e': gap_extension = atoi(optarg); break;
			case 'a': strcpy(mat_name, optarg); break;
			case 'f': filter = atoi(optarg); break;
			case 'p': protein = 1; break;
			case 'c': path = 1; break;
			case 'r': reverse = 1; break;
			case 's': sam = 1; break;
			case 'h': header = 1; break;
		}
	}
	if (optind + 2 > argc) {
		fprintf(stderr, "\n");
		fprintf(stderr, "Usage: ssw_test [options] ... <target.fasta> <query.fasta>(or <query.fastq>)\n");
		fprintf(stderr, "Options:\n");
		fprintf(stderr, "\t-m N\tN is a positive integer for weight match in genome sequence alignment. [default: 2]\n");
		fprintf(stderr, "\t-x N\tN is a positive integer. -N will be used as weight mismatch in genome sequence alignment. [default: 2]\n");
		fprintf(stderr, "\t-o N\tN is a positive integer. -N will be used as the weight for the gap opening. [default: 3]\n");
		fprintf(stderr, "\t-e N\tN is a positive integer. -N will be used as the weight for the gap extension. [default: 1]\n");
		fprintf(stderr, "\t-p\tDo protein sequence alignment. Without this option, the ssw_test will do genome sequence alignment.\n");
		fprintf(stderr, "\t-a FILE\tFILE is either the Blosum or Pam weight matrix. [default: Blosum50]\n");
		fprintf(stderr, "\t-c\tReturn the alignment path.\n");
		fprintf(stderr, "\t-f N\tN is a positive integer. Only output the alignments with the Smith-Waterman score >= N.\n");
		fprintf(stderr, "\t-r\tThe best alignment will be picked between the original read alignment and the reverse complement read alignment.\n");
		fprintf(stderr, "\t-s\tOutput in SAM format. [default: no header]\n");
		fprintf(stderr, "\t-h\tIf -s is used, include header in SAM output.\n\n");
		return 1;
	}

	// initialize scoring matrix for genome sequences
	for (l = k = 0; LIKELY(l < 4); ++l) {
		for (m = 0; LIKELY(m < 4); ++m) mata[k++] = l == m ? match : -mismatch;	/* weight_match : -weight_mismatch */
		mata[k++] = 0; // ambiguous base
	}
	for (m = 0; LIKELY(m < 5); ++m) mata[k++] = 0;

	if (protein == 1 && (! strcmp(mat_name, "\0"))) {
		n = 24;
		table = aa_table;
		mat = mat50;
	} else if (strcmp(mat_name, "\0")) {

	// Parse score matrix.
		FILE *f_mat = fopen(mat_name, "r");
		char line[128];
		mata = (int8_t*)realloc(mata, 1024 * sizeof(int8_t));
		k = 0;
		m = 0;
		while (fgets(line, 128, f_mat)) {
			if (line[0] == '*' || (line[0] >= 'A' && line[0] <= 'Z')) {
				if (line[0] >= 'A' && line[0] <= 'Z') aa_table[(int)line[0]] = aa_table[(int)line[0] + 32] = m;
				char str[4], *s = str;
				str[0] = '\0';
				l = 1;
				while (line[l]) {
					if ((line[l] >= '0' && line[l] <= '9') || line[l] == '-') *s++ = line[l];
					else if (str[0] != '\0') {
						*s = '\0';
						mata[k++] = (int8_t)atoi(str);
						s = str;
						str[0] = '\0';
					}
					++l;
				}
				if (str[0] != '\0') {
					*s = '\0';
					mata[k++] = (int8_t)atoi(str);
					s = str;
					str[0] = '\0';
				}
				++m;
			}
		}
		if (k == 0) {
			fprintf(stderr, "Problem of reading the weight matrix file.\n");
			return 1;
		}
		fclose(f_mat);
		n = m;
		table = aa_table;
		mat = mata;
	}

	//fprintf(stderr, "query: %s\n", argv[optind + 1]);
	read_fp = gzopen(argv[optind + 1], "r");

    if (! read_fp) {
        fprintf (stderr, "gzopen of '%s' failed.\n", argv[optind + 1]);
            exit (EXIT_FAILURE);
    }

	read_seq = kseq_init(read_fp);
	if (sam && header && path) {
		fprintf(stdout, "@HD\tVN:1.4\tSO:queryname\n");
		ref_fp = gzopen(argv[optind], "r");
		ref_seq = kseq_init(ref_fp);
		while ((l = kseq_read(ref_seq)) >= 0) fprintf(stdout, "@SQ\tSN:%s\tLN:%d\n", ref_seq->name.s, (int32_t)ref_seq->seq.l);
		kseq_destroy(ref_seq);
		gzclose(ref_fp);
	} else if (sam && !path) {
		fprintf(stderr, "SAM format output is only available together with option -c.\n");
		sam = 0;
	}

	// alignment
	if (reverse == 1 && n == 5) {
		read_rc = (char*)malloc(s2);
		num_rc = (int8_t*)malloc(s2);
	}
	start = clock();
	while (kseq_read(read_seq) >= 0) {
		s_profile* p, *p_rc = 0;
		int32_t readLen = read_seq->seq.l;
		int32_t maskLen = readLen / 2;

		while (readLen >= s2) {
			++s2;
			kroundup32(s2);
			num = (int8_t*)realloc(num, s2);
			if (reverse == 1 && n == 5) {
				read_rc = (char*)realloc(read_rc, s2);
				num_rc = (int8_t*)realloc(num_rc, s2);
			}
		}
		for (m = 0; m < readLen; ++m) num[m] = table[(int)read_seq->seq.s[m]];
		p = ssw_init(num, readLen, mat, n, 2);
		if (reverse == 1 && n == 5) {
			reverse_comple(read_seq->seq.s, read_rc);
			for (m = 0; m < readLen; ++m) num_rc[m] = table[(int)read_rc[m]];
			p_rc = ssw_init(num_rc, readLen, mat, n, 2);
		}else if (reverse == 1 && n == 24) {
			fprintf (stderr, "Reverse complement alignment is not available for protein sequences. \n");
			return 1;
		}

		ref_fp = gzopen(argv[optind], "r");
		ref_seq = kseq_init(ref_fp);
		while (kseq_read(ref_seq) >= 0) {
			s_align* result, *result_rc = 0;
			int32_t refLen = ref_seq->seq.l;
			int8_t flag = 0;
			while (refLen > s1) {
				++s1;
				kroundup32(s1);
				ref_num = (int8_t*)realloc(ref_num, s1);
			}
			for (m = 0; m < refLen; ++m) ref_num[m] = table[(int)ref_seq->seq.s[m]];
			if (path == 1) flag = 2;
			result = ssw_align (p, ref_num, refLen, gap_open, gap_extension, flag, filter, 0, maskLen);
			if (reverse == 1 && protein == 0)
				result_rc = ssw_align(p_rc, ref_num, refLen, gap_open, gap_extension, flag, filter, 0, maskLen);
			if (result_rc && result_rc->score1 > result->score1 && result_rc->score1 >= filter) {
				if (sam) ssw_write (result_rc, ref_seq, read_seq, read_rc, table, 1, 1);
				else ssw_write (result_rc, ref_seq, read_seq, read_rc, table, 1, 0);
			}else if (result && result->score1 >= filter){
				if (sam) ssw_write(result, ref_seq, read_seq, read_seq->seq.s, table, 0, 1);
				else ssw_write(result, ref_seq, read_seq, read_seq->seq.s, table, 0, 0);
			} else if (! result) return 1;
			if (result_rc) align_destroy(result_rc);
			align_destroy(result);
		}

		if(p_rc) init_destroy(p_rc);
		init_destroy(p);
		kseq_destroy(ref_seq);
		gzclose(ref_fp);
	}
	end = clock();
	cpu_time = ((float) (end - start)) / CLOCKS_PER_SEC;
	fprintf(stderr, "CPU time: %f seconds\n", cpu_time);

	if (num_rc) {
		free(num_rc);
		free(read_rc);
	}
	kseq_destroy(read_seq);
	gzclose(read_fp);
	free(num);
	free(ref_num);
 	free(mata);
	return 0;
}