Esempio n. 1
0
/*
 * Align the records of two FASTA streams pairwise, in file order.
 *
 * For every pair of records read from fp1 and fp2 the function calls
 * aln_align() with the given parameters and alignment type, prints a
 * tab-separated header line followed by whichever of the three output
 * strings (out1/outm/out2) are set, and flushes stdout.  A "//" separator
 * line is printed unless type is ALN_BOUND_ALIGN.  When misc_flag is
 * non-zero, aln_output_segment() is called on the alignment path as well.
 *
 * Reading stops as soon as either stream yields a negative length.
 * Returns the number of pairs that were aligned.
 */
int aln_pair_align(FILE *fp1, FILE *fp2, AlnParam *ap, int type, int misc_flag)
{
	seq_t seq1, seq2;
	char name1[MAX_NAME_LEN], name2[MAX_NAME_LEN];
	int n = 0;

	INIT_SEQ(seq1);
	INIT_SEQ(seq2);

	while (1) {
		AlnAln *aa;
		path_t *first, *last;
		int len1 = read_fasta(fp1, &seq1, name1, 0);
		int len2 = read_fasta(fp2, &seq2, name2, 0);

		if (len1 < 0 || len2 < 0)
			break;

		aa = aln_align((char*)seq1.s, (char*)seq2.s, ap, type);

		/* The header reports the coordinates stored in the last path entry
		 * before those stored in the first one. */
		first = aa->path;
		last = aa->path + aa->path_len - 1;
		printf(">%s\t%d\t%d\t%d\t%s\t%d\t%d\t%d\t%d\n",
				name1, len1, last->i, first->i,
				name2, len2, last->j, first->j, aa->score);

		if (aa->out1)
			printf("%s\n", aa->out1);
		if (aa->outm)
			printf("%s\n", aa->outm);
		if (aa->out2)
			printf("%s\n", aa->out2);
		if (type != ALN_BOUND_ALIGN)
			printf("//\n");
		fflush(stdout);

		if (misc_flag) {
			aln_output_segment((char*)seq1.s, (char*)seq2.s, aa->path,
					aa->path_len, name1, name2);
		}
		aln_free_AlnAln(aa);
		++n;
	}

	MYFREE(seq1.s);
	MYFREE(seq2.s);
	return n;
}
Esempio n. 2
0
/// Build a Contig named `name` from the sequence that read_fasta() returns
/// for (`filename`, `name`).
Contig
read_contig(const std::string& filename,
            const std::string& name)
{
  SeqType bases = read_fasta(filename, name);
  return Contig(name, bases);
}
Esempio n. 3
0
/// Build a Contig named `name` from a sequence and its quality values.
///
/// The sequence comes from read_fasta(fasta_filename, name) and the quality
/// values from read_quality(quality_filename, name); the sequence is read
/// first.
Contig
read_contig(const std::string& fasta_filename,
            const std::string& quality_filename, const std::string& name)
{
  SeqType bases = read_fasta(fasta_filename, name);
  QualSeqType scores = read_quality(quality_filename, name);
  return Contig(name, bases, scores);
}
Esempio n. 4
0
/// Read an aligned FASTA from stdin, keep the columns in which at least
/// half of the sequences (integer division) carry a non-gap character, and
/// write the result to stdout: a "<num sequences> <num columns>" header
/// followed by one left-aligned, padded name plus the selected columns per
/// sequence.
///
/// Returns 1 with a message on stderr when the input is empty or when the
/// sequences do not all have the same length; the command-line arguments
/// are not used.
int main( int argc, char *argv[] ) {
    std::vector<std::string> names;
    std::vector<std::vector<uint8_t> > data;

    read_fasta( std::cin, names, data, false );
    std::cerr << "num: " << data.size() << "\n";

    // Without this guard data.at(0) below would throw on empty input.
    if( data.empty() ) {
        std::cerr << "error: no sequences read from input\n";
        return 1;
    }

    size_t max_name_len = 0;
    const size_t num_col = data.at(0).size();
    std::vector<size_t> col_nongap_count( num_col );
    for( size_t i = 0; i < names.size(); ++i ) {
        const std::vector< uint8_t > &seq = data.at(i);

        // A real check rather than an assert: with NDEBUG a shorter sequence
        // would otherwise be indexed out of bounds in the loop below.
        if( seq.size() != num_col ) {
            std::cerr << "error: sequence '" << names[i] << "' has length "
                      << seq.size() << ", expected " << num_col << "\n";
            return 1;
        }

        for( size_t j = 0; j < num_col; ++j ) {
            if( seq[j] != '-' ) {
                ++col_nongap_count[j];
            }
        }
        max_name_len = std::max(names[i].size(), max_name_len);
    }

    // Keep a column when at least half of the sequences are non-gap there.
    const size_t min_nongap = names.size() / 2;
    std::vector<size_t> selected_cols;
    for( size_t i = 0; i < col_nongap_count.size(); ++i ) {
        if( col_nongap_count[i] >= min_nongap ) {
            selected_cols.push_back(i);
        }
    }

    std::cout << names.size() << " " << selected_cols.size() << "\n";

    for( size_t i = 0; i < names.size(); ++i ) {
        std::cout << std::setw(max_name_len + 1) << std::left << names[i];
        for( size_t j = 0; j < selected_cols.size(); ++j ) {
            size_t col = selected_cols[j];
            std::cout << data[i].at(col);
        }

        std::cout << "\n";
    }

    return 0;
}
Esempio n. 5
0
/// Open `filename` for reading and hand the stream to the FILE*-based
/// read_fasta() overload, closing the file afterwards.
///
/// Returns false (after reporting through printError) when the file cannot
/// be opened; otherwise returns whatever the stream overload returns.
bool read_fasta(const char *filename, Sequences *seqs)
{
    FILE *fp = fopen(filename, "r");
    if (!fp) {
        printError("cannot read file '%s'", filename);
        return false;
    }

    const bool ok = read_fasta(fp, seqs);
    fclose(fp);
    return ok;
}
Esempio n. 6
0
// Copy the debug level and minimum mapping quality from `parameters`, load
// the reference contigs, initialise BAM handling and, if motifs were
// provided, load the systematic-error contexts.  Progress is reported on
// stdout.
void InputStructures::BringUpReferenceData(ExtendParameters &parameters){

  DEBUG = parameters.program_flow.DEBUG;
  min_map_qv = parameters.MQL0;

  cout << "Loading reference." << endl;
  read_fasta(parameters.fasta, reference_contigs);
  cout << "Loaded reference. Ref length: " << reference_contigs.size() << endl;

  bam_initialize(parameters.bams);

  // Nothing more to do when no error-motif file was supplied.
  if (!parameters.sseMotifsProvided)
    return;

  cout << "Loading systematic error contexts." << endl;
  read_error_motifs(parameters.sseMotifsFileName);
  cout << "Loaded." << endl;
}
Esempio n. 7
0
File: readseq.c Progetto: Pency/BSPT
/*
 * Append one parsed quality value to item->qual, growing the buffer in
 * BUFFER_LENGTH steps.  Returns 0 on success, -1 if the buffer cannot grow.
 */
static int store_quality_value(SEQ_QUAL *item, int *count, int *capacity, const char *token)
{
	if(*count + 1 >= *capacity){
		/* Grow to the NEW capacity; requesting only BUFFER_LENGTH here
		 * (as before) never enlarged the buffer and let writes overflow it. */
		int grown_cap = *capacity + BUFFER_LENGTH;
		void *grown = realloc(item->qual, sizeof(*item->qual) * (size_t)grown_cap);
		if(grown == NULL) return -1;
		item->qual = grown;
		*capacity = grown_cap;
	}
	item->qual[(*count)++] = atoi(token);
	return 0;
}

/*
 * Read one sequence record from zfps (via read_fasta) and the matching
 * FASTA-style quality record from zfpq into item.
 *
 * The quality record is a '>' header line followed by whitespace-separated
 * decimal values; parsing stops at the next '>' (which is pushed back so the
 * next call sees it) or at end of file.
 *
 * Returns item->length on success, -1 at end of input or on a malformed
 * record.
 *
 * NOTE(review): assumes item->qual already has room for BUFFER_LENGTH
 * entries on entry (presumably set up by read_fasta or the caller) - confirm.
 */
int read_fastaq(gzFile zfps, gzFile zfpq, SEQ_QUAL *item, int id){

	int c;	/* int, so the -1 returned by gzgetc at EOF is not mistaken for data */
	char qual[64];
	int i=0,j=0;
	int max=BUFFER_LENGTH;

	if(read_fasta(zfps, item, id) < 0)	return -1;

	if(gzgetc(zfpq) != '>'){
		if(gzeof(zfpq)) return -1;
		error_msg("sequence %d has no a FASTA format quality", id);
		return -1;
	}

	/* Skip the remainder of the header line. */
	while((c=gzgetc(zfpq)) != -1 && c != '\n');

	for(;;){
		c = gzgetc(zfpq);
		if(c == -1) break;
		if(c == '>'){
			/* Start of the next record: un-read it.  The previous
			 * gzseek(zfpq,0,SEEK_CUR-1) evaluated to a SEEK_SET seek,
			 * i.e. it rewound the whole file. */
			gzungetc(c, zfpq);
			break;
		}
		if(c != ' ' && c != '\n' && c != '\r' && c != '\t'){
			if(j >= (int)sizeof(qual) - 1){
				error_msg("sequence %d has a malformed quality value", id);
				return -1;
			}
			qual[j++] = (char)c;
			continue;
		}
		/* Whitespace ends the current value; runs of whitespace add nothing. */
		if(j == 0) continue;
		qual[j]='\0';
		if(store_quality_value(item, &i, &max, qual) < 0) return -1;
		j=0;
	}

	/* A final value not followed by whitespace is still a value. */
	if(j > 0){
		qual[j]='\0';
		if(store_quality_value(item, &i, &max, qual) < 0) return -1;
	}

	item->id = id;
	item->start = 0;
	item->end = item->length - 1;
	return item->length;
}
Esempio n. 8
0
  /// Fill `buff` from input stream `i`.
  ///
  /// Returns true only when the stream is marked DONE_TYPE.  Otherwise the
  /// matching reader (FASTA or FASTQ) is invoked, the next file is opened
  /// if the underlying stream is no longer good, and false is returned.
  inline bool produce(uint32_t i, sequence_list& buff) {
    stream_status& st = streams_[i];

    if(st.type == DONE_TYPE)
      return true;

    if(st.type == FASTA_TYPE)
      read_fasta(st, buff);
    else if(st.type == FASTQ_TYPE)
      read_fastq(st, buff);

    // Stream exhausted or failed: close current and try to open the next one
    if(!st.stream->good())
      open_next_file(st);
    return false;
  }
    /*
     * Test-suite set-up: allocate the shared reference and fragment records,
     * create the fragment database, load the reference from "tr1.fna" and
     * read (and print the id of) every record in "tf.fna".
     *
     * Returns EXIT_SUCCESS, or EXIT_FAILURE when an allocation fails, the
     * reference cannot be read or the fragment file cannot be opened.
     */
    int init_testsuite(void){
        /* Size each allocation by the object pointed to.  sizeof(RefSeqP) /
         * sizeof(FragSeqP) is the size of the type named in the cast, which
         * by its name and use is presumably a pointer typedef, so the old
         * code allocated too little. */
        ref_seq = (RefSeqP)calloc(1, sizeof(*ref_seq));
        frag_seq = (FragSeqP)calloc(1, sizeof(*frag_seq));
        if (ref_seq == NULL || frag_seq == NULL)
            return EXIT_FAILURE;

        frag_db = init_FSDB();

        // read in our test reference sequence
        if (read_fasta_ref(ref_seq, "tr1.fna") != 1)
            return EXIT_FAILURE;

        FILE* frag_file = fileOpen("tf.fna", "r");
        if (frag_file == NULL)
            return EXIT_FAILURE;

        while (read_fasta(frag_file, frag_seq)){
            printf("%s\n", frag_seq->id);
        }

        /* The handle is local to this function, so release it here. */
        fclose(frag_file);

        return EXIT_SUCCESS;
    }
Esempio n. 10
0
// Prepare everything that depends on the run parameters: copy the
// program-flow switches, load the reference contigs, initialise BAM
// handling, optionally load systematic-error motifs, and configure
// recalibration.
void InputStructures::BringUpReferenceData(ExtendParameters &parameters)
{
    // Switches copied verbatim from the parameter object.
    DEBUG = parameters.program_flow.DEBUG;
    min_map_qv = parameters.MQL0;
    use_SSE_basecaller = parameters.program_flow.use_SSE_basecaller;
    do_snp_realignment = parameters.program_flow.do_snp_realignment;

    cout << "Loading reference." << endl;
    read_fasta(parameters.fasta, reference_contigs);
    cout << "Loaded reference. Ref length: " << reference_contigs.size() << endl;

    // The BAM headers may carry recalibration information.
    bam_initialize(parameters.bams);

    if (parameters.sseMotifsProvided)
    {
        cout << "Loading systematic error contexts." << endl;
        read_error_motifs(parameters.sseMotifsFileName);
        cout << "Loaded." << endl;
    }

    // Homopolymer recalibration model: set up only when a model file was
    // named.  The model is initialised from the raw command-line options
    // because that is the interface the basecaller module exposes.
    if (parameters.recal_model_file_name.length() != 0)
    {
        do_recal.recalModel.Initialize(parameters.opts);
        do_recal.use_recal_model_only = true;
        do_recal.is_live = true;
    }

    // An explicit request to suppress recalibration wins, even if a model
    // was just read in.
    if (parameters.program_flow.suppress_recalibration)
    {
        printf("Recalibration model: suppressed\n");
        do_recal.recalModel.suppressEnabled();
        do_recal.is_live = false;
    }
}
Esempio n. 11
0
int main(int argc, char **argv)
{
  char *exeName = argv[0];
  char *seqA;
  int lenA;
  int markovOrder = 0;
  char *markovFile = NULL;
  char *markovSaveFile = NULL;

  while (1) {
    int c = getopt(argc, argv, "m:f:s:h");
    if (c==-1)
      break;
    switch (c) {
    case 'm':
      markovOrder = atoi(optarg);
      break;
    case 'f':
      markovFile = optarg;
      break;
    case 's':
      markovSaveFile = optarg;
      break;
    case 'h':
    default:
      usage(exeName);
    }
  }
	
  argc -= optind-1;
  argv += optind-1;

  if (argc != 2) {
    usage(exeName);
  } else {
    seqA = read_fasta(argv[1]);
  } 

  lenA = strlen(seqA);

  printf("# Character prediction probability for FASTA file '%s'\n", argv[1]);
  printf("# Markov order = %d\n", markovOrder);
  printf("# Column order = [%s]\n", alphabet);

  {
    int i,j;
    unsigned char seqA_i[lenA];
    DOUBLE seqA_enc[lenA][ALPHA_SIZE];

    // Convert DNA sequence to only an A G C or T
    strict_DNA_seq(seqA, lenA);

    // First convert strings to numbers representing the characters
    for (i=0; i<lenA; i++) seqA_i[i] = char2int(seqA[i]);
  

    markov_init(ALPHA_SIZE, markovOrder);
    if (markovFile)
      markov_load(markovFile);
    else
      markov_fit(lenA, seqA_i);
    
    markov_predict(lenA, seqA_i, (DOUBLE*)seqA_enc);


    for (i=0; i<lenA; i++) {
      for (j=0; j<ALPHA_SIZE; j++) {
	printf("%f ", exp2(-seqA_enc[i][j]));
      }
      printf("\n");
    }

    if (markovSaveFile) {
      FILE *f = fopen(markovSaveFile, "w");
      if (!f) {
        fprintf(stderr, "Unable to open file '%s' for writing.\n", markovSaveFile);
      } else {
        fprintf(stderr, "Saving Markov Model parameters to file '%s'\n", markovSaveFile);
        markov_save(f);
      }
    }

  }

  return 0;
}
Esempio n. 12
0
// Read a string from the file named by argv[1], build its suffix array with
// LinearSuffixSort and report timings.  When the file is "test.dat" the
// result is also compared against the integers stored in "result.test.dat".
// Files whose name contains ".fas" are read with read_fasta, all others
// with read_input.
int main(int argc, char* argv[])
{
  char* myString = NULL;
  int* suffixArray;
  int stringLength = 0;
  int i;

  // argv[1] is dereferenced repeatedly below, so it must exist.
  if (argc < 2)
  {
    std::cerr << "usage: " << argv[0] << " <input file>" << std::endl;
    return 1;
  }

  ifstream inFile;
  inFile.open(argv[1]);
  Timing timehere;

  if (strcmp(argv[1], "test.dat") != 0) 
  {
    timehere.markbeg();
    // strstr returns NULL when ".fas" does not occur; indexing that result
    // (as the previous code did) crashed for every non-FASTA file name.
    if (strstr(argv[1], ".fas") != NULL)
    {
      read_fasta(inFile, myString, stringLength);
    }
    else
    {
      read_input(inFile, myString, stringLength);
    }
    timehere.markend();
    inFile.close();
    cout << "finish read "
	 << stringLength << " characters."<< endl;
    timehere.outtime();
  }
  else
  {
    read_input(inFile, myString, stringLength);
    inFile.close();
    cout << "finish read " 
	 << stringLength << " characters."<< endl;
  }

  timehere.markbeg();
  suffixArray = LinearSuffixSort(myString, stringLength);
  timehere.markend();
  timehere.outtime("finish suffix sort,");

  if (strcmp(argv[1], "test.dat") == 0) 
  {
    int result;
    bool pass = true;
    ifstream resultF;
    resultF.open("result.test.dat");

    cout << "Testing the Suffix Array" << endl;

    for (i = 0; i < stringLength; i++)
    {
      // A missing or short results file counts as a failure instead of
      // comparing against an unread value.
      if (!(resultF >> result) || result != suffixArray[i])
      {
	pass = false;
      }
    }
    if (pass == false)
    {
      cout << endl;
      cout << "***************" << endl;
      cout << "test has failed" << endl;
      cout << "***************" << endl;
    }
    else
    {
      cout << endl;
      cout << "******************" << endl;
      cout << "test is successful" << endl;
      cout << "******************" << endl;
    }
  }
  // NOTE(review): the snippet ends here; the remainder of main (including
  // its closing brace) is not part of the visible source.
Esempio n. 13
0
/*
 * Entry point: extract haplotype-informative reads from BAM files.
 *
 * Flow, as visible in this function:
 *   1. Parse the command line in (option, value) pairs; "--bam" occurrences
 *      are only counted on this pass.
 *   2. Require at least one BAM file and a variant file, otherwise print
 *      the options and return -1.  The BAM names are copied on a second
 *      pass over argv.
 *   3. Count and read the variants (VCF or old format, depending on
 *      VCFformat) and build the per-chromosome interval map.
 *   4. If a reference FASTA was given, read its header and then either the
 *      whole sequence (FOSMIDS == 0) or just open the file for later use.
 *   5. When readsorted == 0, call parse_bamfile_sorted on every BAM file
 *      (only when FOSMIDS == 0).
 *   6. Close the log and fragment files.
 *
 * Returns 0 on success, -1 on a usage error or when counting variants fails.
 *
 * NOTE(review): the option loop reads argv[i+1] without checking that
 * i+1 < argc, and copies values with strcpy into fixed 1024-byte buffers;
 * a trailing value-taking option or an over-long value looks unsafe.
 * NOTE(review): because the loop always advances by two, an option that
 * takes no value (e.g. --PEonly) still skips the argument after it.
 * NOTE(review): the results of malloc/calloc/fopen are not checked here.
 */
int main (int argc, char** argv)
{
	char samfile[1024]; char bamfile[1024]; char variantfile[1024]; char fastafile[1024]; char maskfile[1024];
	// "None" is the sentinel for "not supplied on the command line"
	strcpy(samfile,"None"); strcpy(bamfile,"None"); strcpy(variantfile,"None"); strcpy(fastafile,"None"); strcpy(maskfile,"None");
	GROUPNAME = NULL;
	int readsorted = 0;
	char* sampleid = (char*)malloc(1024); sampleid[0] = '-'; sampleid[1] = '\0';
	int samplecol=10; // default if there is a single sample in the VCF file
	int i=0,variants=0,hetvariants=0;
	char** bamfilelist = NULL; int bamfiles =0; 

	logfile = NULL; fragment_file = stdout; // write fragments to this file if it is present
	// first pass over the arguments: options are consumed as (name, value) pairs
	for (i=1;i<argc;i+=2)
	{
		if (strcmp(argv[i],"--bam") ==0 || strcmp(argv[i],"--bamfile") ==0)        bamfiles++; 
		else if (strcmp(argv[i],"--variants") ==0)        strcpy(variantfile,argv[i+1]);
		else if (strcmp(argv[i],"--reffile") ==0 || strcmp(argv[i],"--ref") ==0)        strcpy(fastafile,argv[i+1]);
		else if (strcmp(argv[i],"--mask") ==0 || strcmp(argv[i],"--mappability") ==0)        strcpy(maskfile,argv[i+1]);
		else if (strcmp(argv[i],"--VCF") ==0 || strcmp(argv[i],"--vcf") ==0)    {     strcpy(variantfile,argv[i+1]); VCFformat =1; }
		else if (strcmp(argv[i],"--sorted") ==0)       readsorted = atoi(argv[i+1]);
		else if (strcmp(argv[i],"--mbq") ==0)       MINQ = atoi(argv[i+1]);
		else if (strcmp(argv[i],"--mmq") ==0)       MIN_MQ = atoi(argv[i+1]);
		else if (strcmp(argv[i],"--maxIS") ==0)       MAX_IS = atoi(argv[i+1]);
		else if (strcmp(argv[i],"--minIS") ==0)       MIN_IS = atoi(argv[i+1]);
		else if (strcmp(argv[i],"--PEonly") ==0)       PEONLY = 1;  // discard single end mapped reads 
		else if (strcmp(argv[i],"--indels") ==0)       PARSEINDELS = atoi(argv[i+1]);  // allow indels in hairs
		else if (strcmp(argv[i],"--pflag") ==0)      IFLAG  = atoi(argv[i+1]);  // sets IFLAG; NOTE(review): the original comment duplicated the --indels one, meaning not visible here
		else if (strcmp(argv[i],"--qvoffset") ==0)       QVoffset = atoi(argv[i+1]);
		else if (strcmp(argv[i],"--out") == 0 || strcmp(argv[i],"-o") ==0) fragment_file = fopen(argv[i+1],"w");
		else if (strcmp(argv[i],"--logfile")==0 || strcmp(argv[i],"--log") ==0) logfile = fopen(argv[i+1],"w");  
		else if (strcmp(argv[i],"--singlereads")==0) SINGLEREADS = atoi(argv[i+1]);  
		else if (strcmp(argv[i],"--maxfragments")==0) MAXFRAG = atoi(argv[i+1]);  
		else if (strcmp(argv[i],"--noquality")==0) MISSING_QV = atoi(argv[i+1]);  
		else if (strcmp(argv[i],"--triallelic")==0) TRI_ALLELIC = atoi(argv[i+1]);  
		//else if (strcmp(argv[i],"--fosmids") == 0 || strcmp(argv[i],"--fosmid") ==0) FOSMIDS = 1;
		//else if (strcmp(argv[i],"--prior") == 0) PRIOR = atoi(argv[i+1]); 
		//else if (strcmp(argv[i],"--comparephase") == 0 || strcmp(argv[i],"--compare") ==0) COMPARE_PHASE = atoi(argv[i+1]); 
		else if (strcmp(argv[i],"--groupname") == 0) 
		{
			GROUPNAME = (char*)malloc(1024); strcpy(GROUPNAME,argv[i+1]); 
		}
	}
	// both at least one BAM file and a variant file are mandatory
	if (bamfiles > 0 && strcmp(variantfile,"None") !=0)
	{
		bamfilelist = (char**)malloc(sizeof(char*)*bamfiles); 
		for (i=0;i<bamfiles;i++) bamfilelist[i] = (char*)malloc(1024);
		// second pass: now that the list is allocated, copy the BAM file names
		bamfiles=0;
		for (i=1;i<argc;i+=2)
		{
			if (strcmp(argv[i],"--bam") ==0 || strcmp(argv[i],"--bamfile") ==0)     strcpy(bamfilelist[bamfiles++],argv[i+1]);
		}
		fprintf(stderr,"\n extracting haplotype informative reads from bamfiles %s minQV %d minMQ %d maxIS %d \n\n",bamfilelist[0],MINQ,MIN_MQ,MAX_IS);
	}
	else
	{
		print_options(); return -1;
	}

	HASHTABLE ht; ht.htsize = 7919;  init_hashtable(&ht);
	VARIANT* varlist;
	int chromosomes=0;

	// count the variants first so that varlist can be allocated in one piece
	if (VCFformat ==1)
	{
		variants = count_variants(variantfile,sampleid,&samplecol); 
		if (variants < 0) return -1; 
		varlist = (VARIANT*)malloc(sizeof(VARIANT)*variants);
		chromosomes = read_variantfile(variantfile,varlist,&ht,&hetvariants,samplecol); 
	}
	else
	{
		variants = count_variants_oldformat(variantfile);
		if (variants < 0) return -1; 
		varlist = (VARIANT*)malloc(sizeof(VARIANT)*variants);
		chromosomes = read_variantfile_oldformat(variantfile,varlist,&ht,variants);
	}
	// variants is set to hetvariants only, but this is not correct since 
	VARIANTS = variants;  
	// there are two options, we include all variants in the chromvars datastructure but only use heterozygous variants for outputting HAIRS 
	// variant-id should correspond to line-number in VCF file since that will be used for printing out variants in Hapcut 

	//	fprintf(stderr,"read %d variants from file %s chromosomes %d\n",snps,argv[1],chromosomes);
	CHROMVARS* chromvars  = (CHROMVARS*)malloc(sizeof(CHROMVARS)*chromosomes);
	build_intervalmap(chromvars,chromosomes,varlist,VARIANTS);

	// read reference fasta file for INDELS, currently reads entire genome in one go, need to modify to read chromosome by chromosome 
	REFLIST* reflist = (REFLIST*)malloc(sizeof(REFLIST)); 
	reflist->ns = 0; reflist->names = NULL; reflist->lengths = NULL; reflist->sequences = NULL; reflist->current = -1;
	if (strcmp(fastafile,"None") != 0)
	{
		if (read_fastaheader(fastafile,reflist) > 0) 
		{
			reflist->sequences = calloc(reflist->ns,sizeof(char*)); //(char**)malloc(sizeof(char*)*reflist->ns);
			if (FOSMIDS ==0)
			{
				// one zero-filled buffer of lengths[i]+1 bytes per contig; the first five are reported on stderr
				for (i=0;i<reflist->ns;i++)
				{
					reflist->sequences[i] = calloc(reflist->lengths[i]+1,sizeof(char));
					if (i < 5) fprintf(stderr,"contig %s length %d\n",reflist->names[i],reflist->lengths[i]);
				}
				read_fasta(fastafile,reflist);
			}
			else // 10.27.14 new code to read one chromosome at a time 
			{
				fprintf(stderr,"opening fasta file %s \n",fastafile);
				reflist->fp = fopen(fastafile,"r");
			}
		}
	}
	//return 1;
	// NOTE(review): nothing in this function processes the BAM files when readsorted != 0
	if (readsorted ==0 && bamfiles > 0)
	{
		for (i=0;i<bamfiles;i++) 
		{
			if (FOSMIDS ==0) parse_bamfile_sorted(bamfilelist[i],&ht,chromvars,varlist,reflist);
			//else parse_bamfile_fosmid(bamfilelist[i],&ht,chromvars,varlist,reflist,maskfile); // fosmid pool bam file 
		}
	}
	if (logfile != NULL) fclose(logfile);
	if (fragment_file != NULL && fragment_file != stdout) fclose(fragment_file);


	// need to free up all memory before we exit the program 
	/*
	int xor = pow(2,16)-1;
	for (i=0;i<variants;i++)
	{
		//if (varlist[i].type ==0) continue;
		if (varlist[i].genotype[0] == varlist[i].genotype[2]) continue;
		fprintf(stdout,"variant %d %s %d %d %s %s %d:%d %d:%d \n",i+1,varlist[i].genotype,varlist[i].position-1,varlist[i].type,varlist[i].RA,varlist[i].AA,varlist[i].A1>>16,varlist[i].A1 & xor,varlist[i].A2>>16,varlist[i].A2 & xor);
	}
	*/
	return 0;
}
Esempio n. 14
0
main (int argc, char *argv[]) {

	int i, j, **seqs, **nall, ord=1, ns, **pij, lkf=0, npt=0, pnew=0, anc=0;
	int tcat=1, rcat=0, verb=1, miss=0, *flocs;

	int sw_flag=0, moment_flag=0, rmin_flag=0, sim_flag=0, test_flag=0;
	char fname[MAXNAME+1], **seqnames;
	long seed=-setseed();
	extern int sizeofpset;
	double *locs;

	double **lkmat, *lkres;
	FILE *ifp=NULL, *ifp2=NULL, *ifp3=NULL, *tfp;
	struct site_type **pset;
	struct data_sum *data;
	int ask_questions = 1;
	char *in_str;

	print_help(argc, argv);
	idum = &seed;
	data = malloc((size_t) sizeof(struct data_sum));
	data->exact = 0;
	strcpy(data->prefix, "");

	for(i = 0; i < argc; i++)
	{
		if(*argv[i] == '-')
		{ 
			in_str = argv[i];
			ask_questions = 0;
			if(strcmp(in_str, "-seq") == 0) ifp = fopen(argv[i+1], "r");		
			if(strcmp(in_str, "-loc") == 0) ifp2 = fopen(argv[i+1], "r");
			if(strcmp(in_str, "-lk") == 0) 
			{
				lkf = 1;
				ifp3 = fopen(argv[i+1], "r");
			}
			if(strcmp(in_str, "-exact") == 0) data->exact = 1;
			if(strcmp(in_str, "-concise") == 0) verb=0;
			if(strcmp(in_str, "-window") == 0) sw_flag=1;
			if(strcmp(in_str, "-moment") == 0) moment_flag=1;
			if(strcmp(in_str, "-simulate") == 0) sim_flag=1;
			if(strcmp(in_str, "-rmin_flag") == 0) rmin_flag=2;
			if(strcmp(in_str, "-test") == 0) test_flag=1;
			if(strcmp(in_str, "-prefix") == 0) strcpy(data->prefix, argv[i+1]);
		}
	}
	if (ifp == NULL) 
	{
		printf("\nCould not find seqs file in command line.\n");
		printf("\nInput filename for seqs:\n");
		scanf("%s", &fname);
		ifp = fopen(fname, "r");
	}
	if (ifp == NULL) nrerror("Error in opening sequence file");

	
	fscanf(ifp,"%i%i%i", &data->nseq, &data->lseq, &data->hd);
	if ((data->nseq < 2) || (data->lseq < 2)) {printf("\n\nInsufficient data for analysis (n > 1, L > 1) \n\n"); exit(1);}
	if (data->nseq > SEQ_MAX) {printf("\n\nMore than max no. sequences: Using first %i for analysis\n\n", SEQ_MAX); data->nseq=SEQ_MAX;}
	printf("\nAnalysing %i (n=%i) sequences of length %i seg sites\n", data->nseq, data->hd, data->lseq);
	seqs = imatrix(1, data->nseq, 1, data->lseq);
    seqnames = cmatrix(1, data->nseq+11, 1, MAXNAME+11);
	if (read_fasta(seqs, ifp, data->nseq, data->lseq, seqnames)) printf("\nSequences read succesfully\n");
    fclose(ifp);

	nall = imatrix(1, data->lseq, 1, 6);
	allele_count(seqs, data->nseq, data->lseq, nall,1, data->hd, data->prefix);

	/*Store lnfac values in array for speed of computation*/

	lnfac_array = (double *) malloc((size_t) ((int) (data->nseq+2)*(data->hd))*sizeof(double));

	lnfac_array[0]=lnfac_array[1]=0;

	for (j=2;j<=((int) data->nseq*(data->hd));j++) lnfac_array[j]=(double) lnfac_array[j-1]+log(j);


	/*Open file with location of seg sites and read in data*/	
	if (ifp2 == NULL) 
	{
		printf("\nCould not find locs file in command line.\n");
		printf("\nInput name of file containing location of seg sites\n\n");
		scanf("%s", &fname);
		ifp2 = fopen(fname, "r");
	}

	if (ifp2 == NULL) nrerror("Cannot open loc file");
	fscanf(ifp2, "%i %lf %c", &ns, &data->tlseq, &data->lc);
	if (ns != data->lseq) nrerror("Lseq and Locs disagree");
	if ((data->lc != 'C')&&(data->lc != 'L')) nrerror("Must input linear(L)/conversion(C)");
	if (data->lc == 'C') {
	  data->avc=0;
	  while (data->avc <= 0) {
	    printf("\n\nInput average tract length for conversion model: ");scanf("%lf", &(data->avc));
	  }
	}

	locs = dvector(1, data->lseq);
	flocs = ivector(1, data->lseq); /*Array to use when simulating data*/


	for (i=1; i<=data->lseq; i++) {
		fscanf(ifp2, "%lf", &locs[i]); 
		if ((locs[i]==0)||(locs[i]>data->tlseq)) {printf("\n\nError in Loc file\n\n%lf\n", data->tlseq); exit(1);}
		if (i>1 && locs[i]<=locs[i-1]) nrerror("Error in locs file: SNPs must be montonically increasing");
	}
	printf("\nLocation of seg sites\n\n");
	for (i=1; i<=data->lseq; i++) printf("%3i   %4.2lf\n", i, locs[i]);
	fclose(ifp2);

	/*Read in likelihood file where needed*/
    if (ask_questions) 
	{
			printf("\n\nUse existing likelihood file? (yes=1, no=0):");
			scanf("%i", &lkf);  /*lkf is a flag: 1 means use existing likelihood file as starting point*/
			if (lkf) 
			{
				printf("\n\nInput name of likelihood file: ");
				scanf("%s", &fname);
				ifp3 = fopen(fname, "r");
			}
			else 
				data->exact=0;

			if (lkf == 1)
			{
				printf("\n\nIs likelihood file an exact match to data?(no=0/yes=1): ");
				scanf("%i", &data->exact);
			}
	}

	if (lkf && !ifp3) nrerror("Cannot open likelihood file");
	if (!lkf && data->hd==2) nrerror("For diploid data need complete lookup table for sequences");

	/*Store pair-types in pij matrix - classify in pair_spectrum routine*/

	data->w	= data->lseq;  /*Note for this program use all data - pair_int restricts to a smaller window*/
	pij = imatrix((int) 1,(int) data->lseq,(int) 1,(int) data->w);

	for (i=1;i<=data->lseq;i++) for (j=1;j<=data->w;j++) pij[i][j]=0;

	pset = init_pset(pset, lkf, ifp3, &npt, data);  /*Reads in type configurations from likelihood file*/

	printf("\n\n*** Calculating distribution of pair types ***\n\n");
	pset = pair_spectrum(seqs, data, nall, pset, &npt, &pnew, &miss, anc, pij);
	printf("\n\n *** Completed classification of pair types ***\n\n");

	if (data->exact && (pnew || miss)) nrerror("Lookup table is not exact for sequences\n(possibly generated by interval)");
	printf("\n\nOld = %i: New = %i: Missing = %i\n\n", npt,pnew,miss);
	data->ptt = (int) npt+pnew+miss;  /*npt is number from likelihood file, pnew is number new with no missing data, miss is # new with missing data*/
	if (verb) {
		strcpy(fname, data->prefix);
		tfp = fopen(strcat(fname, "type_table.txt"), "w");
		if (!tfp) nrerror("Cannot open type file");
		type_print(pij, data->lseq, data->w,tfp);
		fclose(tfp);
	}
	if (verb) print_pairs(stdout, pset, npt+pnew, data->hd, data->nseq);

	/*Need a complete set for missing data or diploid data - check this*/
	if (!data->exact && (data->hd ==2 || miss)) {
		printf("\n\nMissing data or diploid: checking that likelihood table is exhaustive\n\n");
		check_exhaustive(pset,npt,(data->nseq)*((int) data->hd));
	}
	/*Read parameters and likelihoods from likelihood file - where appropriate*/
	if (lkf) {
		read_pars(ifp3, &tcat, &data->th, &data->rcat, &data->rmax);
		lkmat = dmatrix(1,npt+pnew+miss,1,data->rcat);
		if (lkf) read_lk(ifp3, lkmat, npt, tcat, data->rcat);
	}

	/*If haploid, but novel types, need to calculate new likelihoods and input parameter values*/
	if (data->hd ==1 && pnew) { /*Note can have pnew for diploid data, but this has been checked for already*/
		if (!lkf) {
			data->th=data->rmax=-1.0; data->rcat=0;
			printf("\n\nInput theta per site (suggest Watterson estimate of %.5lf):",(double) data->lseq/(watterson(data->nseq*data->hd)*data->tlseq));
			while (data->th<0.0) scanf("%lf", &data->th);
			printf("\n\nMax 4Ner for grid (suggest 100):");
			while(data->rmax<0.0) scanf("%lf", &data->rmax);
			printf("\n\nNumber of points on grid (suggest 101, min=2):");
			while(data->rcat<2) scanf("%i", &data->rcat);
			lkmat = dmatrix(1,npt+pnew+miss,1,data->rcat);
		}
		lk_est(pset,npt,pnew,lkmat,data->th,data->rcat,data->rmax);
		data->exact=1;
	}

	/*Sum over missing data or resolve genotypes and sum over missing data+configurations*/
	else if (miss && data->hd==1) {  
		printf("\n\n*** Calculating likelihoods for missing data ***\n\n");
		for (i=1;i<=miss;i++) {
			lk_miss(pset[npt+i],lkmat[npt+i],lkmat,data);
			printf("\rType %i", i);
		}

		printf("  ...Done!\n\n");
	}


	/*Sum over resolutions for diploid data*/
	else if (data->hd==2 && !data->exact) {
	  printf("\n\n*** Resolving diploid data: %i ***\n\n",pnew+miss);
	  lkres = dvector(1,data->rcat);
	  for (i=1;i<=pnew+miss;i++) {
	    lk_resolve(lkres,pset[npt+i],lkmat[npt+i],lkmat,data);
	    printf("\rType %i", i); 
	  }
	  free_dvector(lkres,1,data->rcat); 

	  printf("  ...Done!\n\n");
	}

	/*If new likelihood generated can output likelihood file for future analyses*/
	if (verb) print_lks(pset, data, npt+pnew+miss, lkmat);


	/*Basic analysis - estimation of 4Ner asuming constant rate*/

	data->rme=data->rmax; data->rce=data->rcat;
	if (1) {
		printf("\n\nDo you wish to change grid over which to estimate likelihoods for (default = %i points, 4Ner 0 - %.1lf) (1/0) :",data->rcat,data->rmax);
		scanf("%i", &lkf);
		if (lkf) {
			data->rme=-10; data->rce=0;
			printf("\n\nMax 4Ner for estimation           : ");
			while (data->rme < 0.0) scanf("%lf", &data->rme);  
       		printf("\n\nNumber of classes to estimate for: ");
       		while (data->rce < 1) scanf("%i", &data->rce);
		}
	}
	data->lksurf = dmatrix(1,data->rce,1,2);
	lk_surf(pset, pij, data, lkmat, data->th, locs, 1);


	/*Print marginal likelihood ratio test statistics for each pair of sites*/
	printf("\n\nCalculating fits\n\n");
	fit_pwlk(data,pij,locs,lkmat,verb);

	/*Sliding windows version*/
	if (1) {
		printf("\n\nDo you wish to carry out a sliding windows analysis? (yes=1/no=0):");
		scanf("%i", &sw_flag);
	}
	if (sw_flag) lk_win(pset,pij,data,lkmat,locs,nall);

	/*Nonparametric estimation of recombination rate*/
	if (1) {
		printf("\n\nPrint out table of Rmin values?\n(0=No, 1=Total only, 2=Full table):");
		scanf("%i", &rmin_flag);
	}

	if (rmin_flag) {
		rmin(data, pset, pij, locs, lkf-1);
		printf("\n\nLower bound on Rmin = %i\n\n",data->rmin);
	}

	/*Estimate 4Ner by Wakeley 1997 method*/
	if (1) {
		printf("\n\nEstimate 4Ner by moment method? (yes=1, no=0)");
		scanf("%i", &moment_flag);
	}

	if (moment_flag) wakeley_est(data, seqs, locs);

	/*Recombination tests - only available for haploid data!*/
	if (data->hd==1) {
		if (1) {
			printf("\n\nDo you wish to test for recombination? (yes=1, no=0): ");
			scanf("%i", &test_flag);
		}
		if (test_flag) {
			rec_test(data, pij, locs, lkmat, pset, npt+pnew+miss);
		}
	}

	/*Conditional simulation - only available for haploid data with a complete lk file*/
	if (data->hd==1 && !(data->exact)) {

		if (1) {
	  printf("\n\nDo you wish to test constant-rate model and estimate sampling distribution by simulation? (yes=1/no=0): ");
	  scanf("%i", &test_flag);
		}
	  if (test_flag) {
	    freq_min(locs, flocs, nall, data);
	    printf("\n\nHow many simulations? ");
	    scanf("%i", &lkf);
	    snp_sim(locs, flocs, pset, lkmat, lkf, data);
	  }
	}

	free_imatrix(pij,1,data->lseq,1,data->w);
	free_imatrix(seqs,1,data->nseq,1,data->lseq);
	free_imatrix(nall,1,data->lseq,1,5);
	for (i=1;i<sizeofpset;i++) free(pset[i]);
	free(pset);
	free(data);
	free_dvector(locs, 1, data->lseq);
	free_ivector(flocs, 1, data->lseq);

	/* system("PAUSE"); */
}
Esempio n. 15
0
int
km_coffee_align3(char *seq_f, int k, int k_leaf, char *method, char *aln_f, int n_cores, int gapopen, int gapext, char *init)
{
	char *use_as_temp = get_tmp_4_tcoffee();

	#ifdef _OPENMP
		omp_set_num_threads(n_cores);
	#endif

	SeqSet *seq_set = read_fasta(seq_f);
	qsort(seq_set->seqs, seq_set->n_seqs, sizeof(Seq*), my_seq_sort);
	srand(time(0));


	short j = -1;
	short i;
	/****************************************************
	Sequences to vector using k-mers
	*****************************************************/
	short alphabet[256];

	// standard alphabet
	for (i = 65; i < 91; ++i)
		if ((i==66) || (i==74) || (i==79) || (i==88) || (i==90))
			alphabet[i] = 0;
		else
			alphabet[i] = ++j;
	j=-1;
	for (i = 97; i < 123; ++i)
		if ((i==98) || (i==106) || (i==111) || (i==120) || (i==122))
			alphabet[i] = 0;
		else
			alphabet[i] = ++j;

	// shrinked alphabet
//	for (i = 0; i < 256; ++i)
//		alphabet[i] = 0;

// 	char *groups[]={"LlVvIiMmCcAaGgSsTtPpFfYyWw","EeDdNnQqKkRrHh"};
//	char *groups[]={"LlVvIiMmCc","AaGgSsTtPp","FfYyWw","EeDdNnQqKkRrHh"};
//	size_t n_groups = 4;
// 	size_t len;
// 	char *group;
// 	for (i=0; i<n_groups; ++i)
// 	{
// 		group=groups[i];
// 		len=strlen(group);
// 		for (j=0; j<len; ++j)
// 			alphabet[group[j]]=i+1;
// 	}

	VectorSet *vec_set = seqset2vecs_kmer(seq_set, 2, 21, alphabet);


	/****************************************************
		Sequences to vector using distances
	*****************************************************/
	// 	char *groups[]={"LVIMC","AGSTP","FYW","EDNQKRH"};
// 	size_t n_groups = 4;
// 	char *groups[]={"LlVvIiMmCc","AaGgSsTtPp","FfYyWw","EeDdNnQqKkRrHh"};
// 	char *groups[]={"LlVvIiMmCcAaGgSsTtPpFfYyWw","EeDdNnQqKkRrHh"};
// 	size_t n_groups = 2;
// 	VectorSet *vec_set = seqset2vecs_whatever(seq_set, groups, n_groups);






// 	char vec_file[500];
// 	sprintf(vec_file, "%s_2_8_%li_%li.txt", strrchr(seq_f, '/')+1, vec_set->n_vecs, vec_set->dim);

// 	print_vecs(vec_set, &vec_file[0]);
// 	read_vecs(vec_set, "matrix_59");
// 	exit(1);
//	normalize(vec_set);
	KM_node *root = hierarchical_kmeans(vec_set, k, k_leaf, init, 0.001);
// 	KM_node *root = simple_clust(vec_set, k);


	char templatee[400];
	sprintf(templatee, "%s/km_coffee_tmp_XXXXXX", use_as_temp);
	char tmp_str[FILENAME_MAX];
	km_cwd = getcwd(tmp_str, FILENAME_MAX);

	km_tmp_dir = my_make_temp_dir(templatee, "main");
	chdir(km_tmp_dir);
	char out_f[500];
	if (aln_f[0] != '/')
		sprintf(out_f, "%s/%s", km_cwd, aln_f);
	else
		sprintf(out_f, "%s", aln_f);



	size_t n_vecs = seq_set->n_seqs;
	int *assignment = (int*)malloc(n_vecs*sizeof(int));
	size_t l;
	for (l = 0; l< n_vecs; ++l)
		assignment[l]=vec_set->vecs[l]->id;

// 	printf("TRAVERSE\n");
	delVecSet(vec_set);
	traverse_km_tree(root, assignment, seq_set, out_f, n_cores, gapopen, gapext, method);
	free( assignment);
	del_tree(root);
	delSeqSet(seq_set);

	free(km_tmp_dir);




	return EXIT_SUCCESS;
}