/* revcom_seq
   Args: (1) char seq[] - buffer holding the characters to transform
         (2) int len - number of characters of seq to process
   Returns: void
   Reverse complements the first len characters of seq in place:
   their order is reversed and each one is passed through
   revcom_char(). Characters at index len and beyond (including
   any terminator) are not touched.
*/
void revcom_seq( char seq[], int len ) {
  int half = len / 2;
  int lo, hi;
  char held;

  /* Swap mirrored positions, complementing both as we go */
  for ( lo = 0; lo < half; lo++ ) {
    hi      = len - (lo + 1);
    held    = seq[lo];
    seq[lo] = revcom_char( seq[hi] );
    seq[hi] = revcom_char( held );
  }

  /* An odd length leaves one center character with no partner;
     it keeps its position and only needs to be complemented */
  if ( len % 2 == 1 ) {
    seq[half] = revcom_char( seq[half] );
  }
}
/* add_virgin_fs2fsdb
   Args: (1) FragSeqP fs - pointer to a "virgin" FragSeq
         (2) FSDB fsdb - database to add this FragSeq to
   Returns: the result of add_fs2fsdb(): 1 on success; 0 on failure
            (not enough memory)
   This function is only called from sg_align; the argument FragSeqP
   points to a FragSeq for which the following is true:
   id, desc, as, ae, score, front_asp, back_asp, unique, and
   num_inputs are set to correct values.
   If trimmed is true, the sequence and its quality string are cut
   off right after trim_point and seq_len is updated to match.
   If rc is set and the strand is known, the sequence is reverse
   complemented and the quality string is reversed, both in place.
   Once these operations are done, this "non-virgin" FragSeq is
   handed to add_fs2fsdb(), which copies it into the next slot of
   fsdb, growing fsdb if necessary, and incrementing fsdb->num_fss.
*/
int add_virgin_fs2fsdb( FragSeqP fs, FSDB fsdb ) {
  int lo, hi, n, mid;
  char base_held, qual_held;

  /* Trimming: terminate both strings just past the trim point */
  if ( fs->trimmed ) {
    fs->seq[fs->trim_point + 1]  = '\0';
    fs->qual[fs->trim_point + 1] = '\0';
    fs->seq_len = fs->trim_point + 1;
  }

  /* Only flip the read when it aligned as a reverse complement
     AND its strand is actually known */
  if ( fs->rc && fs->strand_known ) {
    n   = fs->seq_len;
    mid = n / 2;

    lo = 0;
    while ( lo < mid ) {
      hi = n - (lo + 1);

      /* Bases: swap the mirrored pair and complement each one */
      base_held   = fs->seq[lo];
      fs->seq[lo] = revcom_char( fs->seq[hi] );
      fs->seq[hi] = revcom_char( base_held );

      /* Quality scores: plain swap, nothing to complement */
      qual_held    = fs->qual[lo];
      fs->qual[lo] = fs->qual[hi];
      fs->qual[hi] = qual_held;

      lo++;
    }

    if ( n % 2 == 1 ) {
      /* Odd length: the center base is complemented where it
         stands; its quality score does not move */
      fs->seq[mid] = revcom_char( fs->seq[mid] );
    }
  }

  /* Hand the finished FragSeq over to the database */
  return add_fs2fsdb( fs, fsdb );
}
/* revcom_PWAF
   Args: (1) PWAlnFragP pwaln - pairwise alignment fragment to flip
   Returns: void
   Reverse complements pwaln->ref_seq and pwaln->frag_seq in place.
   The length of ref_seq (as reported by strlen) is used for both
   strings, so frag_seq is processed over that same number of
   characters. Sets pwaln->revcom to 1 when done.
*/
void revcom_PWAF( PWAlnFragP pwaln ) {
  char* ref  = pwaln->ref_seq;
  char* frag = pwaln->frag_seq;
  int n      = strlen( ref );
  int mid    = n / 2;
  int lo, hi;
  char held;

  for ( lo = 0; lo < mid; lo++ ) {
    hi = n - (lo + 1);

    /* Reference row of the alignment */
    held    = ref[lo];
    ref[lo] = revcom_char( ref[hi] );
    ref[hi] = revcom_char( held );

    /* Fragment row of the alignment */
    held     = frag[lo];
    frag[lo] = revcom_char( frag[hi] );
    frag[hi] = revcom_char( held );
  }

  /* With an odd length there is one center column left over;
     complement it where it stands in both rows */
  if ( n % 2 == 1 ) {
    ref[mid]  = revcom_char( ref[mid] );
    frag[mid] = revcom_char( frag[mid] );
  }

  pwaln->revcom = 1;
}
/* reiterate_assembly Args: (1) a pointer to a sequence to be used as the new reference (2) a MapAlignmentP big enough to store all the alignments (3) a FSDB with sequences to be realigned (4) a AlignmentP big enough for the alignments (5) a front PWAlnFragP for storing front alignments (6) a back PWAlnFragP for storing back alignments (7) a PSSMP with the forward substitution matrices (8) a PSSMP with the revcom substitution matrices Aligns all the FragSeqs from fsdb to the new reference, using the as and ae fields to narrow down where the alignment happens Resets the maln and writes all the results there Returns void */ void reiterate_assembly( char* new_ref_seq, int iter_num, MapAlignmentP maln, FSDB fsdb, AlignmentP a, PWAlnFragP front_pwaln, PWAlnFragP back_pwaln, PSSMP ancsubmat, PSSMP rcancsubmat ) { int i, j, ref_len, ref_start, ref_end, ref_frag_len, max_score, rc_score, aln_seq_len; FragSeqP fs; char iter_ref_id[MAX_ID_LEN + 1]; char tmp_rc[INIT_ALN_SEQ_LEN + 1]; char iter_ref_desc[] = "iteration assembly"; /* Set up maln->ref Keep his seq separate from the external assembly because that is malloced and freed elsewhere */ sprintf( iter_ref_id, "ConsAssem.%d", iter_num ); free( maln->ref->seq ); if ( maln->ref->rcseq != NULL ) { free( maln->ref->rcseq ); } free( maln->ref->gaps ); ref_len = strlen( new_ref_seq ); maln->ref->seq = (char*)save_malloc((ref_len + 1)* sizeof(char)); strcpy( maln->ref->seq, new_ref_seq ); maln->ref->rcseq = NULL; // never again! /* Keep the ID and description the same if this is the 1st iteration. 
Otherwise, set it to the generic ones */ if ( iter_num > 1 ) { strcpy( maln->ref->id, iter_ref_id ); strcpy( maln->ref->desc, iter_ref_desc ); } maln->ref->seq_len = ref_len; maln->ref->size = (ref_len+1); if ( maln->ref->circular ) { add_ref_wrap( maln->ref ); } else { maln->ref->wrap_seq_len = maln->ref->seq_len; } maln->ref->gaps = (int*)save_malloc((maln->ref->wrap_seq_len+1) * sizeof(int)); for( i = 0; i <= maln->ref->wrap_seq_len; i++ ) { maln->ref->gaps[i] = 0; } /* Reset its AlnSeqArray ->ins to all point to null */ for ( i = 0; i < maln->num_aln_seqs; i++ ) { aln_seq_len = strlen(maln->AlnSeqArray[i]->seq); for ( j = 0; j < aln_seq_len; j++ ) { /* We couldn't have malloced any sequence for inserts past our length; anything non-NULL out there is cruft */ if ( maln->AlnSeqArray[i]->ins[j] != NULL ) { free( maln->AlnSeqArray[i]->ins[j] ); maln->AlnSeqArray[i]->ins[j] = NULL; } } } /* Now, remake the hpcl and hprl arrays if hp_special */ if ( a->hp ) { free( a->hpcl ); free( a->hpcs ); a->hpcl = (int*)save_malloc(maln->ref->wrap_seq_len*sizeof(int)); a->hpcs = (int*)save_malloc(maln->ref->wrap_seq_len*sizeof(int)); pop_hpl_and_hps( maln->ref->seq, maln->ref->wrap_seq_len, a->hpcl, a->hpcs ); } /* Reset the number of aligned sequences in the maln */ maln->num_aln_seqs = 0; /* OK, ref is set up. Let's go through all the sequences in fsdb and re-align them to the new reference. 
If it's a revcom alignment, just use the rcancsubmat */ for( i = 0; i < fsdb->num_fss; i++ ) { fs = fsdb->fss[i]; /* Special case of distant reference and !fs->strand_known => try to realign both strands against the entire reference to learn the strand and alignment region */ if ( maln->distant_ref && (fs->strand_known == 0 ) && (iter_num > 1) ) { ref_start = 0; ref_end = maln->ref->wrap_seq_len; ref_frag_len = ref_end - ref_start; a->seq1 = &maln->ref->seq[0]; a->len1 = ref_frag_len; pop_s1c_in_a( a ); a->seq2 = fs->seq; a->len2 = strlen( a->seq2 ); pop_s2c_in_a( a ); if ( a->hp ) { pop_hpl_and_hps( a->seq2, a->len2, a->hprl, a->hprs ); pop_hpl_and_hps( a->seq1, a->len1, a->hpcl, a->hpcs ); } /* Align it! */ dyn_prog( a ); /* Find the best forward score */ max_score = max_sg_score( a ); if ( max_score > FIRST_ROUND_SCORE_CUTOFF ) { fs->strand_known = 1; fs->rc = 0; find_align_begin( a ); fs->as = a->abc; fs->ae = a->aec; fs->score = max_score; } /* Now, try reverse complement */ aln_seq_len = strlen( fs->seq ); a->submat = rcancsubmat; for ( j = 0; j < aln_seq_len; j++ ) { tmp_rc[j] = revcom_char(fs->seq[aln_seq_len-(j+1)]); } tmp_rc[aln_seq_len] = '\0'; a->seq2 = tmp_rc; pop_s2c_in_a( a ); if ( a->hp ) { pop_hpl_and_hps( a->seq2, a->len2, a->hprl, a->hprs ); pop_hpl_and_hps( a->seq1, a->len1, a->hpcl, a->hpcs ); } dyn_prog( a ); max_score = max_sg_score( a ); if ( (max_score > FIRST_ROUND_SCORE_CUTOFF) && (max_score > fs->score) ) { fs->strand_known = 1; fs->rc = 1; find_align_begin( a ); fs->as = a->abc; fs->ae = a->aec; fs->score = max_score; strcpy( fs->seq, tmp_rc ); } } /* Do we know the strand (either because we've always known it or we just learned it, doesn't matter) */ if ( fs->strand_known ) { if ( fs->rc ) { a->submat = rcancsubmat; } else { a->submat = ancsubmat; } a->seq2 = fs->seq; a->len2 = strlen( a->seq2 ); pop_s2c_in_a( a ); /* Set up the alignment limits on the reference */ if ( ((fs->as - REALIGN_BUFFER) < 0 ) ) { ref_start = 0; } else { 
ref_start = (fs->as - REALIGN_BUFFER); } if ( (fs->ae + REALIGN_BUFFER + 1) > maln->ref->wrap_seq_len ) { ref_end = maln->ref->wrap_seq_len; } else { ref_end = fs->ae + REALIGN_BUFFER; } /* Check to make sure the regions encompassed by ref_start to ref_end is reasonable given how long this fragment is. If not, just realign this whole mofo again because the reference has probably changed a lot between iterations */ if ( (ref_start + a->len2) > ref_end ) { ref_start = 0; ref_end = maln->ref->wrap_seq_len; } ref_frag_len = ref_end - ref_start; a->seq1 = &maln->ref->seq[ref_start]; a->len1 = ref_frag_len; pop_s1c_in_a( a ); /* If we want the homopolymer discount, the necessary arrays of hp starts and lengths must be set up anew */ if ( a->hp ) { pop_hpl_and_hps( a->seq2, a->len2, a->hprl, a->hprs ); pop_hpl_and_hps( a->seq1, a->len1, a->hpcl, a->hpcs ); } /* Align it! */ dyn_prog( a ); /* Find the best score */ max_score = max_sg_score( a ); find_align_begin( a ); /* First, put all alignment in front_pwaln */ populate_pwaln_to_begin( a, front_pwaln ); /* Load up front_pwaln */ strcpy( front_pwaln->ref_id, maln->ref->id ); strcpy( front_pwaln->ref_desc, maln->ref->desc ); strcpy( front_pwaln->frag_id, fs->id ); strcpy( front_pwaln->frag_desc, fs->desc ); front_pwaln->trimmed = fs->trimmed; front_pwaln->revcom = fs->rc; front_pwaln->num_inputs = fs->num_inputs; front_pwaln->segment = 'a'; front_pwaln->score = a->best_score; front_pwaln->start = a->abc + ref_start; front_pwaln->end = a->aec + ref_start; /* Update stats for this FragSeq */ fs->as = a->abc + ref_start; fs->ae = a->aec + ref_start; fs->unique_best = 1; fs->score = a->best_score; if ( front_pwaln->end > maln->ref->seq_len ) { /* This alignment wraps around - adjust the end to demonstrate this for split_maln check */ front_pwaln->end = front_pwaln->end - maln->ref->seq_len; } if ( front_pwaln->start > front_pwaln->end ) { /* Move wrapped bit to back_pwaln */ split_pwaln( front_pwaln, back_pwaln, 
maln->ref->seq_len ); merge_pwaln_into_maln( front_pwaln, maln ); fs->front_asp = maln->AlnSeqArray[maln->num_aln_seqs - 1]; merge_pwaln_into_maln( back_pwaln, maln ); fs->back_asp = maln->AlnSeqArray[maln->num_aln_seqs - 1]; } else { merge_pwaln_into_maln( front_pwaln, maln ); fs->front_asp = maln->AlnSeqArray[maln->num_aln_seqs - 1]; } } } return; }