Example #1
0
/* revcom_seq
   Args: (1) char seq[] - buffer holding the bases to transform
         (2) int len - number of bases in seq to operate on
   Reverse-complements the first len characters of seq in place:
   the outer pairs are swapped while each base is passed through
   revcom_char(), then the center base (odd len only) is passed
   through revcom_char() where it sits.
   Returns void
*/
void revcom_seq( char seq[], int len) {
  const int mid = len / 2;
  int front;

  for (front = 0; front < mid; front++) {
    const int  back       = len - (front + 1);
    const char front_base = seq[front];

    seq[front] = revcom_char(seq[back]);
    seq[back]  = revcom_char(front_base);
  }

  /* An odd length leaves exactly one base, at index mid, that has
     no partner to swap with; it only needs complementing */
  if (len % 2 == 1) {
    seq[mid] = revcom_char(seq[mid]);
  }
}
/* add_virgin_fs2fsdb
   Args: (1) FragSeqP fs - pointer to a "virgin" FragSeq
         (2) FSDB fsdb - database that receives this FragSeq
   Returns: the return value of add_fs2fsdb() (described by the
   original author as 1 on success, 0 on failure because memory
   ran out)
   Intended to be called only from sg_align, with id, desc, as, ae,
   score, front_asp, back_asp, unique, and num_inputs already set
   to correct values in fs.
   Two finishing steps are applied to fs before it is handed on:
     - if fs->trimmed is set, seq and qual are cut right after
       index fs->trim_point and seq_len is updated to match
     - if fs->rc is set and the strand is known, seq is reverse
       complemented and qual is reversed, both in place
   The finished FragSeq is then passed to add_fs2fsdb(), which is
   expected to store it in fsdb.
*/
int add_virgin_fs2fsdb( FragSeqP fs, FSDB fsdb ) {
  int  left, right;
  char saved_base, saved_qual;

  /* Trimming: terminate both strings just past the trim point */
  if ( fs->trimmed ) {
    fs->seq[fs->trim_point + 1]  = '\0';
    fs->qual[fs->trim_point + 1] = '\0';
    fs->seq_len = fs->trim_point + 1;
  }

  /* Reverse complement only when the alignment was to the reverse
     strand AND we actually know the strand */
  if ( fs->rc && fs->strand_known ) {
    left  = 0;
    right = fs->seq_len - 1;

    /* Walk inward from both ends, swapping as we go */
    while ( left < right ) {
      saved_base = fs->seq[left];
      saved_qual = fs->qual[left];

      fs->seq[left]   = revcom_char(fs->seq[right]);
      fs->seq[right]  = revcom_char(saved_base);
      fs->qual[left]  = fs->qual[right];
      fs->qual[right] = saved_qual;

      left++;
      right--;
    }

    /* The indices meet only when the length is odd: complement the
       center base; its quality score stays where it is */
    if ( left == right ) {
      fs->seq[left] = revcom_char(fs->seq[left]);
    }
  }

  /* Hand the finished FragSeq over to the database */
  return add_fs2fsdb( fs, fsdb );
}
Example #3
0
/* revcom_PWAF
   Args: (1) PWAlnFragP pwaln - pairwise alignment fragment to flip
   Reverse-complements pwaln->ref_seq and pwaln->frag_seq in place.
   The length of ref_seq (via strlen) is used for both strings, so
   frag_seq must be at least that long. Sets pwaln->revcom to 1
   when done.
   Returns void
*/
void revcom_PWAF(PWAlnFragP pwaln) {
  char *ref  = pwaln->ref_seq;
  char *frag = pwaln->frag_seq;
  int aln_len = strlen(ref);
  int mid     = aln_len / 2;
  int front;

  for (front = 0; front < mid; front++) {
    int  back = aln_len - (front + 1);
    char held;

    /* reference row first ... */
    held       = ref[front];
    ref[front] = revcom_char(ref[back]);
    ref[back]  = revcom_char(held);

    /* ... then the fragment row, same two columns */
    held        = frag[front];
    frag[front] = revcom_char(frag[back]);
    frag[back]  = revcom_char(held);
  }

  /* With an odd number of columns the center one has no partner;
     it just gets complemented in both rows */
  if (aln_len % 2 == 1) {
    ref[mid]  = revcom_char(ref[mid]);
    frag[mid] = revcom_char(frag[mid]);
  }

  pwaln->revcom = 1;
}
Example #4
0
/* reiterate_assembly
   Args: (1) char* new_ref_seq - NUL-terminated sequence to be used as
             the new reference; it is copied, the caller keeps ownership
         (2) int iter_num - number of this iteration; used to build the
             "ConsAssem.<n>" reference ID. Only for iter_num > 1 are the
             reference ID/description replaced and strand discovery
             attempted
         (3) MapAlignmentP maln - big enough to store all the alignments
         (4) FSDB fsdb - sequences to be realigned
         (5) AlignmentP a - big enough for the alignments
         (6) PWAlnFragP front_pwaln - for storing front alignments
         (7) PWAlnFragP back_pwaln - for storing back alignments
         (8) PSSMP ancsubmat - the forward substitution matrices
         (9) PSSMP rcancsubmat - the revcom substitution matrices
   Aligns all the FragSeqs from fsdb to the new reference, using the
   as and ae fields to narrow down where the alignment happens.
   Resets the maln and writes all the results there.
   FragSeqs whose strand is not known (and could not be learned here)
   are skipped.
   Returns void
*/
void reiterate_assembly( char* new_ref_seq, int iter_num,
                         MapAlignmentP maln,
                         FSDB fsdb, AlignmentP a,
                         PWAlnFragP front_pwaln,
                         PWAlnFragP back_pwaln,
                         PSSMP ancsubmat,
                         PSSMP rcancsubmat ) {
  int i, j,
    ref_len,
    ref_start,
    ref_end,
    ref_frag_len,
    max_score,
    aln_seq_len;
  FragSeqP fs;
  char iter_ref_id[MAX_ID_LEN + 1];
  char tmp_rc[INIT_ALN_SEQ_LEN + 1];
  char iter_ref_desc[] = "iteration assembly";

  /* Set up maln->ref
     Keep his seq separate from the external assembly because that
     is malloced and freed elsewhere
  */
  /* Bounded write: never run past iter_ref_id, whatever MAX_ID_LEN is */
  snprintf( iter_ref_id, sizeof iter_ref_id, "ConsAssem.%d", iter_num );
  free( maln->ref->seq );
  free( maln->ref->rcseq ); /* free(NULL) is a no-op, no guard needed */
  free( maln->ref->gaps );

  ref_len = strlen( new_ref_seq );
  maln->ref->seq = (char*)save_malloc((ref_len + 1)* sizeof(char));
  strcpy( maln->ref->seq, new_ref_seq );
  maln->ref->rcseq = NULL; // never again!
  /* Keep the ID and description the same if this is the 1st
     iteration. Otherwise, set it to the generic ones */
  if ( iter_num > 1 ) {
    strcpy( maln->ref->id, iter_ref_id );
    strcpy( maln->ref->desc, iter_ref_desc );
  }

  maln->ref->seq_len = ref_len;
  maln->ref->size = (ref_len+1);

  if ( maln->ref->circular ) {
    add_ref_wrap( maln->ref );
  }
  else {
    maln->ref->wrap_seq_len = maln->ref->seq_len;
  }
  /* One zeroed gap counter per wrapped reference position, plus one */
  maln->ref->gaps =
    (int*)save_malloc((maln->ref->wrap_seq_len+1) * sizeof(int));
  for( i = 0; i <= maln->ref->wrap_seq_len; i++ ) {
    maln->ref->gaps[i] = 0;
  }

  /* Reset its AlnSeqArray ->ins to all point to null */
  for ( i = 0; i < maln->num_aln_seqs; i++ ) {
    aln_seq_len = strlen(maln->AlnSeqArray[i]->seq);
    for ( j = 0; j < aln_seq_len; j++ ) {
      /* We couldn't have malloced any sequence for
         inserts past our length; anything non-NULL
         out there is cruft */
      if ( maln->AlnSeqArray[i]->ins[j] != NULL ) {
        free( maln->AlnSeqArray[i]->ins[j] );
        maln->AlnSeqArray[i]->ins[j] = NULL;
      }
    }
  }

  /* Now, remake the hpcl and hpcs arrays if hp_special */
  if ( a->hp ) {
    free( a->hpcl );
    free( a->hpcs );
    a->hpcl = (int*)save_malloc(maln->ref->wrap_seq_len*sizeof(int));
    a->hpcs = (int*)save_malloc(maln->ref->wrap_seq_len*sizeof(int));
    pop_hpl_and_hps( maln->ref->seq,
                     maln->ref->wrap_seq_len,
                     a->hpcl, a->hpcs );
  }

  /* Reset the number of aligned sequences in the maln */
  maln->num_aln_seqs = 0;

  /* OK, ref is set up. Let's go through all the sequences in fsdb
     and re-align them to the new reference.
     If it's a revcom alignment,
     just use the rcancsubmat */
  for( i = 0; i < fsdb->num_fss; i++ ) {
    fs = fsdb->fss[i];

    /* Special case of distant reference and
       !fs->strand_known => try to realign both strands
       against the entire reference to learn the
       strand and alignment region
    */
    if ( maln->distant_ref &&
         (fs->strand_known == 0 ) &&
         (iter_num > 1) ) {
      ref_start = 0;
      ref_end = maln->ref->wrap_seq_len;
      ref_frag_len = ref_end - ref_start;
      a->seq1 = &maln->ref->seq[0];
      a->len1 = ref_frag_len;
      pop_s1c_in_a( a );
      a->seq2 = fs->seq;
      a->len2 = strlen( a->seq2 );
      pop_s2c_in_a( a );
      if ( a->hp ) {
        pop_hpl_and_hps( a->seq2, a->len2, a->hprl, a->hprs );
        pop_hpl_and_hps( a->seq1, a->len1, a->hpcl, a->hpcs );
      }
      /* The forward attempt must be scored with the forward matrices.
         a->submat still holds whatever the previous fragment left
         behind (rcancsubmat after any revcom attempt), so set it
         explicitly */
      a->submat = ancsubmat;
      /* Align it! */
      dyn_prog( a );
      /* Find the best forward score */
      max_score = max_sg_score( a );
      if ( max_score > FIRST_ROUND_SCORE_CUTOFF ) {
        fs->strand_known = 1;
        fs->rc = 0;
        find_align_begin( a );
        fs->as = a->abc;
        fs->ae = a->aec;
        fs->score = max_score;
      }

      /* Now, try reverse complement */
      /* NOTE(review): assumes fs->seq is no longer than
         INIT_ALN_SEQ_LEN so that it fits in tmp_rc - TODO confirm */
      aln_seq_len = strlen( fs->seq );
      a->submat = rcancsubmat;
      for ( j = 0; j < aln_seq_len; j++ ) {
        tmp_rc[j] = revcom_char(fs->seq[aln_seq_len-(j+1)]);
      }
      tmp_rc[aln_seq_len] = '\0';
      a->seq2 = tmp_rc;
      /* a->len2 is unchanged: tmp_rc has the same length as fs->seq */
      pop_s2c_in_a( a );
      if ( a->hp ) {
        pop_hpl_and_hps( a->seq2, a->len2, a->hprl, a->hprs );
        pop_hpl_and_hps( a->seq1, a->len1, a->hpcl, a->hpcs );
      }
      dyn_prog( a );
      max_score = max_sg_score( a );
      /* The revcom strand wins only if it passes the cutoff and beats
         the score currently recorded for this fragment */
      if ( (max_score > FIRST_ROUND_SCORE_CUTOFF) &&
           (max_score > fs->score) ) {
        fs->strand_known = 1;
        fs->rc = 1;
        find_align_begin( a );
        fs->as = a->abc;
        fs->ae = a->aec;
        fs->score = max_score;
        /* From here on the fragment is stored reverse complemented.
           NOTE(review): fs->qual is not reversed here, unlike in
           add_virgin_fs2fsdb - confirm whether it should be */
        strcpy( fs->seq, tmp_rc );
      }
    }

    /* Do we know the strand (either because we've always
       known it or we just learned it, doesn't matter) */
    if ( fs->strand_known ) {
      if ( fs->rc ) {
        a->submat = rcancsubmat;
      }
      else {
        a->submat = ancsubmat;
      }

      a->seq2 = fs->seq;
      a->len2 = strlen( a->seq2 );
      pop_s2c_in_a( a );

      /* Set up the alignment limits on the reference */
      if ( ((fs->as - REALIGN_BUFFER) < 0 ) ) {
        ref_start = 0;
      }
      else {
        ref_start = (fs->as - REALIGN_BUFFER);
      }
      if ( (fs->ae + REALIGN_BUFFER + 1) >
           maln->ref->wrap_seq_len ) {
        ref_end = maln->ref->wrap_seq_len;
      }
      else {
        ref_end = fs->ae + REALIGN_BUFFER;
      }

      /* Check to make sure the regions encompassed by ref_start to
         ref_end is reasonable given how long this fragment is. If
         not, just realign this whole mofo again because the reference
         has probably changed a lot between iterations */
      if ( (ref_start + a->len2) > ref_end ) {
        ref_start = 0;
        ref_end = maln->ref->wrap_seq_len;
      }

      ref_frag_len = ref_end - ref_start;
      a->seq1 = &maln->ref->seq[ref_start];
      a->len1 = ref_frag_len;
      pop_s1c_in_a( a );

      /* If we want the homopolymer discount, the necessary arrays of
         hp starts and lengths must be set up anew */
      if ( a->hp ) {
        pop_hpl_and_hps( a->seq2, a->len2, a->hprl, a->hprs );
        pop_hpl_and_hps( a->seq1, a->len1, a->hpcl, a->hpcs );
      }

      /* Align it! */
      dyn_prog( a );

      /* Find the best score; the value used below is a->best_score */
      max_score = max_sg_score( a );

      find_align_begin( a );

      /* First, put all alignment in front_pwaln */
      populate_pwaln_to_begin( a, front_pwaln );

      /* Load up front_pwaln */
      strcpy( front_pwaln->ref_id, maln->ref->id );
      strcpy( front_pwaln->ref_desc, maln->ref->desc );

      strcpy( front_pwaln->frag_id, fs->id );
      strcpy( front_pwaln->frag_desc, fs->desc );

      front_pwaln->trimmed = fs->trimmed;
      front_pwaln->revcom  = fs->rc;
      front_pwaln->num_inputs = fs->num_inputs;
      front_pwaln->segment = 'a';
      front_pwaln->score = a->best_score;

      /* a->abc / a->aec are relative to the reference window that was
         aligned; shift them back to whole-reference coordinates */
      front_pwaln->start = a->abc + ref_start;
      front_pwaln->end   = a->aec + ref_start;

      /* Update stats for this FragSeq */
      fs->as = a->abc + ref_start;
      fs->ae = a->aec + ref_start;
      fs->unique_best = 1;
      fs->score = a->best_score;

      if ( front_pwaln->end > maln->ref->seq_len ) {
        /* This alignment wraps around - adjust the end to
           demonstrate this for split_maln check */
        front_pwaln->end = front_pwaln->end - maln->ref->seq_len;
      }

      if ( front_pwaln->start > front_pwaln->end ) {
        /* Move wrapped bit to back_pwaln */
        split_pwaln( front_pwaln, back_pwaln, maln->ref->seq_len );
        merge_pwaln_into_maln( front_pwaln, maln );
        fs->front_asp = maln->AlnSeqArray[maln->num_aln_seqs - 1];
        merge_pwaln_into_maln( back_pwaln, maln );
        fs->back_asp = maln->AlnSeqArray[maln->num_aln_seqs - 1];
      }
      else {
        merge_pwaln_into_maln( front_pwaln, maln );
        fs->front_asp = maln->AlnSeqArray[maln->num_aln_seqs - 1];
      }
    }
  }
}