/*
 * Align the records of two FASTA streams pairwise and print each alignment.
 *
 * One record is read from fp1 and one from fp2 per iteration; the loop stops
 * as soon as either read_fasta() call reports a negative length. For every
 * pair a header line is printed to stdout (names, lengths, the i/j
 * coordinates of the last and first path elements, and the score), followed
 * by whichever of out1/outm/out2 the aligner produced.
 *
 *   ap        - alignment parameters, forwarded to aln_align()
 *   type      - alignment type, forwarded to aln_align(); a "//" separator
 *               line is printed unless it equals ALN_BOUND_ALIGN
 *   misc_flag - when non-zero, aln_output_segment() is also called per pair
 *
 * Returns the number of pairs that were aligned.
 */
int aln_pair_align(FILE *fp1, FILE *fp2, AlnParam *ap, int type, int misc_flag)
{
	seq_t first, second;
	char first_name[MAX_NAME_LEN], second_name[MAX_NAME_LEN];
	int n_pairs = 0;

	INIT_SEQ(first);
	INIT_SEQ(second);

	while (1) {
		AlnAln *aln;
		path_t *head, *tail;
		/* Both streams are always advanced before the end-of-input test. */
		int first_len = read_fasta(fp1, &first, first_name, 0);
		int second_len = read_fasta(fp2, &second, second_name, 0);

		if (!(first_len >= 0 && second_len >= 0))
			break;

		aln = aln_align((char*)first.s, (char*)second.s, ap, type);
		head = aln->path;
		tail = aln->path + aln->path_len - 1;

		printf(">%s\t%d\t%d\t%d\t%s\t%d\t%d\t%d\t%d\n",
		       first_name, first_len, tail->i, head->i,
		       second_name, second_len, tail->j, head->j,
		       aln->score);
		if (aln->out1)
			printf("%s\n", aln->out1);
		if (aln->outm)
			printf("%s\n", aln->outm);
		if (aln->out2)
			printf("%s\n", aln->out2);
		if (type != ALN_BOUND_ALIGN)
			printf("//\n");
		fflush(stdout);

		if (misc_flag)
			aln_output_segment((char*)first.s, (char*)second.s,
			                   aln->path, aln->path_len,
			                   first_name, second_name);

		aln_free_AlnAln(aln);
		++n_pairs;
	}

	MYFREE(first.s);
	MYFREE(second.s);
	return n_pairs;
}
/// Build a Contig called `name` from the sequence that read_fasta() yields
/// for (`filename`, `name`). No quality information is attached.
Contig read_contig(const std::string& filename, const std::string& name)
{
    SeqType bases = read_fasta(filename, name);

    return Contig(name, bases);
}
/// Build a Contig called `name` from a FASTA file and a matching quality file.
/// The sequence is read first (read_fasta), then the qualities (read_quality);
/// both lookups are given `name`.
Contig read_contig(const std::string& fasta_filename,
                   const std::string& quality_filename,
                   const std::string& name)
{
    SeqType bases = read_fasta(fasta_filename, name);
    QualSeqType scores = read_quality(quality_filename, name);

    return Contig(name, bases, scores);
}
int main( int argc, char *argv[] ) { std::vector<std::string> names; std::vector<std::vector<uint8_t> > data; // while( std::cin.good() ) { // std::cout << char(std::cin.get()); // } // return 0; // std::cerr << "goodx: " << std::cin.good() << std::endl; read_fasta( std::cin, names, data, false ); std::cerr << "num: " << data.size() << "\n"; size_t max_name_len = 0; const size_t num_col = data.at(0).size(); std::vector<size_t> col_nongap_count( num_col ); for( size_t i = 0; i < names.size(); ++i ) { const std::vector< uint8_t > &seq = data.at(i); assert( seq.size() == num_col ); // TODO: make it a real check! for( size_t j = 0; j < num_col; ++j ) { if( seq[j] != '-' ) { ++col_nongap_count[j]; } } max_name_len = std::max(names[i].size(), max_name_len); } const size_t min_nongap = names.size() / 2; std::vector<size_t> selected_cols; for( size_t i = 0; i < col_nongap_count.size(); ++i ) { if( col_nongap_count[i] >= min_nongap ) { selected_cols.push_back(i); } } std::cout << names.size() << " " << selected_cols.size() << "\n"; for( size_t i = 0; i < names.size(); ++i ) { std::cout << std::setw(max_name_len + 1) << std::left << names[i]; //std::copy( data[i].begin(), data[i].end(), std::ostream_iterator<char>(std::cout) ); for( size_t j = 0; j < selected_cols.size(); ++j ) { size_t col = selected_cols[j]; std::cout << data[i].at(col); } std::cout << "\n"; } }
/// Convenience overload: opens `filename` for reading and hands the stream to
/// the FILE*-based read_fasta(), closing the file afterwards.
///
/// @return false (after reporting through printError) when the file cannot be
///         opened; otherwise whatever the FILE* overload returns.
bool read_fasta(const char *filename, Sequences *seqs)
{
    FILE *fp = fopen(filename, "r");
    if (fp == NULL) {
        printError("cannot read file '%s'", filename);
        return false;
    }

    const bool ok = read_fasta(fp, seqs);
    fclose(fp);
    return ok;
}
void InputStructures::BringUpReferenceData(ExtendParameters ¶meters){ DEBUG = parameters.program_flow.DEBUG; min_map_qv = parameters.MQL0; cout << "Loading reference." << endl; read_fasta(parameters.fasta, reference_contigs); cout << "Loaded reference. Ref length: " << reference_contigs.size() << endl; bam_initialize(parameters.bams); if (parameters.sseMotifsProvided) { cout << "Loading systematic error contexts." << endl; read_error_motifs(parameters.sseMotifsFileName); cout << "Loaded." << endl; } }
/*
 * Read one sequence record from zfps (via read_fasta) and the matching
 * FASTA-style quality record from zfpq into *item.
 *
 * The quality record is a '>' header line followed by decimal values
 * separated by spaces; the values are converted with atoi() and stored in
 * item->qual in order. On success item->id, item->start and item->end are
 * filled in and item->length is returned; -1 is returned when the sequence
 * cannot be read, the quality stream is at EOF or has no '>' header, or memory
 * runs out.
 *
 * Fixes relative to the previous version:
 *  - the quality buffer was "grown" with realloc(..., BUFFER_LENGTH), i.e. it
 *    never grew; it is now resized to the new capacity and the result checked;
 *  - the '>' of the next record was "pushed back" with
 *    gzseek(zfpq, 0, SEEK_CUR-1); where SEEK_CUR is 1 (the common value) that
 *    whence is 0 == SEEK_SET and rewinds to the start of the file, so
 *    gzungetc() is used instead;
 *  - the character is kept in an int so the -1 returned by gzgetc() at EOF is
 *    not stored as quality text;
 *  - the 64-byte token buffer can no longer be overrun;
 *  - a value terminated by a newline or by the end of the record (rather than
 *    by a space) is stored instead of being dropped or glued to the next one.
 *
 * NOTE(review): the initial capacity of item->qual is assumed to be
 * BUFFER_LENGTH elements (that is what the growth logic starts from) -
 * confirm against read_fasta()/the allocator of SEQ_QUAL.
 */
int read_fastaq(gzFile zfps, gzFile zfpq, SEQ_QUAL *item, int id){
	int c;
	char qual[64];
	int i=0, j=0;
	int max=BUFFER_LENGTH;
	int at_end;

	if(read_fasta(zfps, item, id) < 0) return -1;

	if(gzgetc(zfpq) != '>'){
		if(gzeof(zfpq)) return -1;
		error_msg("sequence %d has no a FASTA format quality", id);
		return -1;
	}

	/* Skip the rest of the header line. */
	while((c=gzgetc(zfpq)) != -1 && c != '\n');

	for(;;){
		c = gzgetc(zfpq);
		at_end = (c == -1 || c == '>');

		if(!at_end && c != ' ' && c != '\n'){
			/* Part of a value; characters beyond the token buffer are ignored. */
			if(j < (int)sizeof(qual) - 1) qual[j++] = (char)c;
			continue;
		}

		/* A space always closes a token (as before, an empty one stores 0);
		   a newline or the end of the record only closes a pending token. */
		if(c == ' ' || j > 0){
			if(i+1 >= max){
				void *grown;
				max += BUFFER_LENGTH;
				grown = realloc(item->qual, sizeof(*item->qual) * max);
				if(grown == NULL) return -1; /* item->qual is left untouched */
				item->qual = grown;
			}
			qual[j]='\0';
			item->qual[i++] = atoi(qual);
			j=0;
		}

		if(at_end) break;
	}

	/* Leave the '>' of the next record in the stream for the next call. */
	if(c == '>' && gzungetc(c, zfpq) < 0) return -1;

	item->id = id;
	item->start = 0;
	item->end = item->length - 1;
	return item->length;
}
inline bool produce(uint32_t i, sequence_list& buff) { stream_status& st = streams_[i]; switch(st.type) { case FASTA_TYPE: read_fasta(st, buff); break; case FASTQ_TYPE: read_fastq(st, buff); break; case DONE_TYPE: return true; } if(st.stream->good()) return false; // Reach the end of file, close current and try to open the next one open_next_file(st); return false; }
/*
 * Test-suite set-up: allocates the global reference and fragment records,
 * initialises the fragment database, loads the reference from "tr1.fna" and
 * then reads every fragment from "tf.fna", printing each fragment id.
 *
 * Returns EXIT_SUCCESS, or EXIT_FAILURE when an allocation fails, the
 * reference cannot be read (read_fasta_ref() != 1) or the fragment file
 * cannot be opened.
 *
 * Fixes: the records were allocated with sizeof(RefSeqP)/sizeof(FragSeqP),
 * i.e. the size of a pointer rather than of the pointed-to structure; the
 * allocations were not checked; the fragment file was never closed.
 */
int init_testsuite(void){
	FILE* frag_file;

	/* sizeof(*ptr) allocates the structure itself, whatever its type. */
	ref_seq = (RefSeqP)calloc(1, sizeof(*ref_seq));
	frag_seq = (FragSeqP)calloc(1, sizeof(*frag_seq));
	frag_db = init_FSDB();
	if (ref_seq == NULL || frag_seq == NULL)
		return EXIT_FAILURE;

	// read in our test reference sequence
	if (read_fasta_ref(ref_seq, "tr1.fna") != 1)
		return EXIT_FAILURE;

	frag_file = fileOpen("tf.fna", "r");
	if (frag_file == NULL)
		return EXIT_FAILURE;

	while (read_fasta(frag_file, frag_seq)){
		printf("%s\n", frag_seq->id);
	}
	fclose(frag_file);

	return EXIT_SUCCESS;
}
void InputStructures::BringUpReferenceData(ExtendParameters ¶meters) { DEBUG = parameters.program_flow.DEBUG; min_map_qv = parameters.MQL0; use_SSE_basecaller = parameters.program_flow.use_SSE_basecaller; do_snp_realignment = parameters.program_flow.do_snp_realignment; cout << "Loading reference." << endl; read_fasta(parameters.fasta, reference_contigs); cout << "Loaded reference. Ref length: " << reference_contigs.size() << endl; // some recalibration information may be read from bam file header bam_initialize(parameters.bams); if (parameters.sseMotifsProvided) { cout << "Loading systematic error contexts." << endl; read_error_motifs(parameters.sseMotifsFileName); cout << "Loaded." << endl; } // Load homopolymer recalibration model // why is recal model using the command line directly? <-- Because the basecaller module is programmed that way. // initialize only if there's a model file if (parameters.recal_model_file_name.length()>0){ do_recal.recalModel.Initialize(parameters.opts); do_recal.use_recal_model_only = true; do_recal.is_live = true; } // finally turn off recalibration if not wanted // even although we have a nice set of recalibration read-in. if (parameters.program_flow.suppress_recalibration) { printf("Recalibration model: suppressed\n"); do_recal.recalModel.suppressEnabled(); do_recal.is_live = false; } }
int main(int argc, char **argv) { char *exeName = argv[0]; char *seqA; int lenA; int markovOrder = 0; char *markovFile = NULL; char *markovSaveFile = NULL; while (1) { int c = getopt(argc, argv, "m:f:s:h"); if (c==-1) break; switch (c) { case 'm': markovOrder = atoi(optarg); break; case 'f': markovFile = optarg; break; case 's': markovSaveFile = optarg; break; case 'h': default: usage(exeName); } } argc -= optind-1; argv += optind-1; if (argc != 2) { usage(exeName); } else { seqA = read_fasta(argv[1]); } lenA = strlen(seqA); printf("# Character prediction probability for FASTA file '%s'\n", argv[1]); printf("# Markov order = %d\n", markovOrder); printf("# Column order = [%s]\n", alphabet); { int i,j; unsigned char seqA_i[lenA]; DOUBLE seqA_enc[lenA][ALPHA_SIZE]; // Convert DNA sequence to only an A G C or T strict_DNA_seq(seqA, lenA); // First convert strings to numbers representing the characters for (i=0; i<lenA; i++) seqA_i[i] = char2int(seqA[i]); markov_init(ALPHA_SIZE, markovOrder); if (markovFile) markov_load(markovFile); else markov_fit(lenA, seqA_i); markov_predict(lenA, seqA_i, (DOUBLE*)seqA_enc); for (i=0; i<lenA; i++) { for (j=0; j<ALPHA_SIZE; j++) { printf("%f ", exp2(-seqA_enc[i][j])); } printf("\n"); } if (markovSaveFile) { FILE *f = fopen(markovSaveFile, "w"); if (!f) { fprintf(stderr, "Unable to open file '%s' for writing.\n", markovSaveFile); } else { fprintf(stderr, "Saving Markov Model parameters to file '%s'\n", markovSaveFile); markov_save(f); } } } return 0; }
int main(int argc, char* argv[]) { char* myString; int* suffixArray; int stringLength; int i; ifstream inFile; inFile.open(argv[1]); Timing timehere; if (strcmp(argv[1], "test.dat") != 0) { timehere.markbeg(); if (strstr(argv[1], ".fas")[0] == '.') { read_fasta(inFile, myString, stringLength); } else { read_input(inFile, myString, stringLength); } timehere.markend(); inFile.close(); cout << "finish read " << stringLength << " characters."<< endl; timehere.outtime(); } else { read_input(inFile, myString, stringLength); inFile.close(); cout << "finish read " << stringLength << " characters."<< endl; } timehere.markbeg(); suffixArray = LinearSuffixSort(myString, stringLength); timehere.markend(); timehere.outtime("finish suffix sort,"); if (strcmp(argv[1], "test.dat") == 0) { int result; bool pass = true; ifstream resultF; resultF.open("result.test.dat"); cout << "Testing the Suffix Array" << endl; for (i = 0; i < stringLength; i++) { resultF >> result; if (result != suffixArray[i]) { pass = false; } } if (pass == false) { cout << endl; cout << "***************" << endl; cout << "test has failed" << endl; cout << "***************" << endl; } else { cout << endl; cout << "******************" << endl; cout << "test is successful" << endl; cout << "******************" << endl; } }
int main (int argc, char** argv) { char samfile[1024]; char bamfile[1024]; char variantfile[1024]; char fastafile[1024]; char maskfile[1024]; strcpy(samfile,"None"); strcpy(bamfile,"None"); strcpy(variantfile,"None"); strcpy(fastafile,"None"); strcpy(maskfile,"None"); GROUPNAME = NULL; int readsorted = 0; char* sampleid = (char*)malloc(1024); sampleid[0] = '-'; sampleid[1] = '\0'; int samplecol=10; // default if there is a single sample in the VCF file int i=0,variants=0,hetvariants=0; char** bamfilelist = NULL; int bamfiles =0; logfile = NULL; fragment_file = stdout; // write fragments to this file if it is present for (i=1;i<argc;i+=2) { if (strcmp(argv[i],"--bam") ==0 || strcmp(argv[i],"--bamfile") ==0) bamfiles++; else if (strcmp(argv[i],"--variants") ==0) strcpy(variantfile,argv[i+1]); else if (strcmp(argv[i],"--reffile") ==0 || strcmp(argv[i],"--ref") ==0) strcpy(fastafile,argv[i+1]); else if (strcmp(argv[i],"--mask") ==0 || strcmp(argv[i],"--mappability") ==0) strcpy(maskfile,argv[i+1]); else if (strcmp(argv[i],"--VCF") ==0 || strcmp(argv[i],"--vcf") ==0) { strcpy(variantfile,argv[i+1]); VCFformat =1; } else if (strcmp(argv[i],"--sorted") ==0) readsorted = atoi(argv[i+1]); else if (strcmp(argv[i],"--mbq") ==0) MINQ = atoi(argv[i+1]); else if (strcmp(argv[i],"--mmq") ==0) MIN_MQ = atoi(argv[i+1]); else if (strcmp(argv[i],"--maxIS") ==0) MAX_IS = atoi(argv[i+1]); else if (strcmp(argv[i],"--minIS") ==0) MIN_IS = atoi(argv[i+1]); else if (strcmp(argv[i],"--PEonly") ==0) PEONLY = 1; // discard single end mapped reads else if (strcmp(argv[i],"--indels") ==0) PARSEINDELS = atoi(argv[i+1]); // allow indels in hairs else if (strcmp(argv[i],"--pflag") ==0) IFLAG = atoi(argv[i+1]); // allow indels in hairs else if (strcmp(argv[i],"--qvoffset") ==0) QVoffset = atoi(argv[i+1]); else if (strcmp(argv[i],"--out") == 0 || strcmp(argv[i],"-o") ==0) fragment_file = fopen(argv[i+1],"w"); else if (strcmp(argv[i],"--logfile")==0 || strcmp(argv[i],"--log") ==0) logfile = 
fopen(argv[i+1],"w"); else if (strcmp(argv[i],"--singlereads")==0) SINGLEREADS = atoi(argv[i+1]); else if (strcmp(argv[i],"--maxfragments")==0) MAXFRAG = atoi(argv[i+1]); else if (strcmp(argv[i],"--noquality")==0) MISSING_QV = atoi(argv[i+1]); else if (strcmp(argv[i],"--triallelic")==0) TRI_ALLELIC = atoi(argv[i+1]); //else if (strcmp(argv[i],"--fosmids") == 0 || strcmp(argv[i],"--fosmid") ==0) FOSMIDS = 1; //else if (strcmp(argv[i],"--prior") == 0) PRIOR = atoi(argv[i+1]); //else if (strcmp(argv[i],"--comparephase") == 0 || strcmp(argv[i],"--compare") ==0) COMPARE_PHASE = atoi(argv[i+1]); else if (strcmp(argv[i],"--groupname") == 0) { GROUPNAME = (char*)malloc(1024); strcpy(GROUPNAME,argv[i+1]); } } if (bamfiles > 0 && strcmp(variantfile,"None") !=0) { bamfilelist = (char**)malloc(sizeof(char*)*bamfiles); for (i=0;i<bamfiles;i++) bamfilelist[i] = (char*)malloc(1024); bamfiles=0; for (i=1;i<argc;i+=2) { if (strcmp(argv[i],"--bam") ==0 || strcmp(argv[i],"--bamfile") ==0) strcpy(bamfilelist[bamfiles++],argv[i+1]); } fprintf(stderr,"\n extracting haplotype informative reads from bamfiles %s minQV %d minMQ %d maxIS %d \n\n",bamfilelist[0],MINQ,MIN_MQ,MAX_IS); } else { print_options(); return -1; } HASHTABLE ht; ht.htsize = 7919; init_hashtable(&ht); VARIANT* varlist; int chromosomes=0; if (VCFformat ==1) { variants = count_variants(variantfile,sampleid,&samplecol); if (variants < 0) return -1; varlist = (VARIANT*)malloc(sizeof(VARIANT)*variants); chromosomes = read_variantfile(variantfile,varlist,&ht,&hetvariants,samplecol); } else { variants = count_variants_oldformat(variantfile); if (variants < 0) return -1; varlist = (VARIANT*)malloc(sizeof(VARIANT)*variants); chromosomes = read_variantfile_oldformat(variantfile,varlist,&ht,variants); } // variants is set to hetvariants only, but this is not correct since VARIANTS = variants; // there are two options, we include all variants in the chromvars datastructure but only use heterozygous variants for outputting HAIRS // 
variant-id should correspond to line-number in VCF file since that will be used for printing out variants in Hapcut // fprintf(stderr,"read %d variants from file %s chromosomes %d\n",snps,argv[1],chromosomes); CHROMVARS* chromvars = (CHROMVARS*)malloc(sizeof(CHROMVARS)*chromosomes); build_intervalmap(chromvars,chromosomes,varlist,VARIANTS); // read reference fasta file for INDELS, currently reads entire genome in one go, need to modify to read chromosome by chromosome REFLIST* reflist = (REFLIST*)malloc(sizeof(REFLIST)); reflist->ns = 0; reflist->names = NULL; reflist->lengths = NULL; reflist->sequences = NULL; reflist->current = -1; if (strcmp(fastafile,"None") != 0) { if (read_fastaheader(fastafile,reflist) > 0) { reflist->sequences = calloc(reflist->ns,sizeof(char*)); //(char**)malloc(sizeof(char*)*reflist->ns); if (FOSMIDS ==0) { for (i=0;i<reflist->ns;i++) { reflist->sequences[i] = calloc(reflist->lengths[i]+1,sizeof(char)); if (i < 5) fprintf(stderr,"contig %s length %d\n",reflist->names[i],reflist->lengths[i]); } read_fasta(fastafile,reflist); } else // 10.27.14 new code to read one chromosome at a time { fprintf(stderr,"opening fasta file %s \n",fastafile); reflist->fp = fopen(fastafile,"r"); } } } //return 1; if (readsorted ==0 && bamfiles > 0) { for (i=0;i<bamfiles;i++) { if (FOSMIDS ==0) parse_bamfile_sorted(bamfilelist[i],&ht,chromvars,varlist,reflist); //else parse_bamfile_fosmid(bamfilelist[i],&ht,chromvars,varlist,reflist,maskfile); // fosmid pool bam file } } if (logfile != NULL) fclose(logfile); if (fragment_file != NULL && fragment_file != stdout) fclose(fragment_file); // need to free up all memory before we exit the program /* int xor = pow(2,16)-1; for (i=0;i<variants;i++) { //if (varlist[i].type ==0) continue; if (varlist[i].genotype[0] == varlist[i].genotype[2]) continue; fprintf(stdout,"variant %d %s %d %d %s %s %d:%d %d:%d 
\n",i+1,varlist[i].genotype,varlist[i].position-1,varlist[i].type,varlist[i].RA,varlist[i].AA,varlist[i].A1>>16,varlist[i].A1 & xor,varlist[i].A2>>16,varlist[i].A2 & xor); } */ return 0; }
main (int argc, char *argv[]) { int i, j, **seqs, **nall, ord=1, ns, **pij, lkf=0, npt=0, pnew=0, anc=0; int tcat=1, rcat=0, verb=1, miss=0, *flocs; int sw_flag=0, moment_flag=0, rmin_flag=0, sim_flag=0, test_flag=0; char fname[MAXNAME+1], **seqnames; long seed=-setseed(); extern int sizeofpset; double *locs; double **lkmat, *lkres; FILE *ifp=NULL, *ifp2=NULL, *ifp3=NULL, *tfp; struct site_type **pset; struct data_sum *data; int ask_questions = 1; char *in_str; print_help(argc, argv); idum = &seed; data = malloc((size_t) sizeof(struct data_sum)); data->exact = 0; strcpy(data->prefix, ""); for(i = 0; i < argc; i++) { if(*argv[i] == '-') { in_str = argv[i]; ask_questions = 0; if(strcmp(in_str, "-seq") == 0) ifp = fopen(argv[i+1], "r"); if(strcmp(in_str, "-loc") == 0) ifp2 = fopen(argv[i+1], "r"); if(strcmp(in_str, "-lk") == 0) { lkf = 1; ifp3 = fopen(argv[i+1], "r"); } if(strcmp(in_str, "-exact") == 0) data->exact = 1; if(strcmp(in_str, "-concise") == 0) verb=0; if(strcmp(in_str, "-window") == 0) sw_flag=1; if(strcmp(in_str, "-moment") == 0) moment_flag=1; if(strcmp(in_str, "-simulate") == 0) sim_flag=1; if(strcmp(in_str, "-rmin_flag") == 0) rmin_flag=2; if(strcmp(in_str, "-test") == 0) test_flag=1; if(strcmp(in_str, "-prefix") == 0) strcpy(data->prefix, argv[i+1]); } } if (ifp == NULL) { printf("\nCould not find seqs file in command line.\n"); printf("\nInput filename for seqs:\n"); scanf("%s", &fname); ifp = fopen(fname, "r"); } if (ifp == NULL) nrerror("Error in opening sequence file"); fscanf(ifp,"%i%i%i", &data->nseq, &data->lseq, &data->hd); if ((data->nseq < 2) || (data->lseq < 2)) {printf("\n\nInsufficient data for analysis (n > 1, L > 1) \n\n"); exit(1);} if (data->nseq > SEQ_MAX) {printf("\n\nMore than max no. 
sequences: Using first %i for analysis\n\n", SEQ_MAX); data->nseq=SEQ_MAX;} printf("\nAnalysing %i (n=%i) sequences of length %i seg sites\n", data->nseq, data->hd, data->lseq); seqs = imatrix(1, data->nseq, 1, data->lseq); seqnames = cmatrix(1, data->nseq+11, 1, MAXNAME+11); if (read_fasta(seqs, ifp, data->nseq, data->lseq, seqnames)) printf("\nSequences read succesfully\n"); fclose(ifp); nall = imatrix(1, data->lseq, 1, 6); allele_count(seqs, data->nseq, data->lseq, nall,1, data->hd, data->prefix); /*Store lnfac values in array for speed of computation*/ lnfac_array = (double *) malloc((size_t) ((int) (data->nseq+2)*(data->hd))*sizeof(double)); lnfac_array[0]=lnfac_array[1]=0; for (j=2;j<=((int) data->nseq*(data->hd));j++) lnfac_array[j]=(double) lnfac_array[j-1]+log(j); /*Open file with location of seg sites and read in data*/ if (ifp2 == NULL) { printf("\nCould not find locs file in command line.\n"); printf("\nInput name of file containing location of seg sites\n\n"); scanf("%s", &fname); ifp2 = fopen(fname, "r"); } if (ifp2 == NULL) nrerror("Cannot open loc file"); fscanf(ifp2, "%i %lf %c", &ns, &data->tlseq, &data->lc); if (ns != data->lseq) nrerror("Lseq and Locs disagree"); if ((data->lc != 'C')&&(data->lc != 'L')) nrerror("Must input linear(L)/conversion(C)"); if (data->lc == 'C') { data->avc=0; while (data->avc <= 0) { printf("\n\nInput average tract length for conversion model: ");scanf("%lf", &(data->avc)); } } locs = dvector(1, data->lseq); flocs = ivector(1, data->lseq); /*Array to use when simulating data*/ for (i=1; i<=data->lseq; i++) { fscanf(ifp2, "%lf", &locs[i]); if ((locs[i]==0)||(locs[i]>data->tlseq)) {printf("\n\nError in Loc file\n\n%lf\n", data->tlseq); exit(1);} if (i>1 && locs[i]<=locs[i-1]) nrerror("Error in locs file: SNPs must be montonically increasing"); } printf("\nLocation of seg sites\n\n"); for (i=1; i<=data->lseq; i++) printf("%3i %4.2lf\n", i, locs[i]); fclose(ifp2); /*Read in likelihood file where needed*/ if (ask_questions) 
{ printf("\n\nUse existing likelihood file? (yes=1, no=0):"); scanf("%i", &lkf); /*lkf is a flag: 1 means use existing likelihood file as starting point*/ if (lkf) { printf("\n\nInput name of likelihood file: "); scanf("%s", &fname); ifp3 = fopen(fname, "r"); } else data->exact=0; if (lkf == 1) { printf("\n\nIs likelihood file an exact match to data?(no=0/yes=1): "); scanf("%i", &data->exact); } } if (lkf && !ifp3) nrerror("Cannot open likelihood file"); if (!lkf && data->hd==2) nrerror("For diploid data need complete lookup table for sequences"); /*Store pair-types in pij matrix - classify in pair_spectrum routine*/ data->w = data->lseq; /*Note for this program use all data - pair_int restricts to a smaller window*/ pij = imatrix((int) 1,(int) data->lseq,(int) 1,(int) data->w); for (i=1;i<=data->lseq;i++) for (j=1;j<=data->w;j++) pij[i][j]=0; pset = init_pset(pset, lkf, ifp3, &npt, data); /*Reads in type configurations from likelihood file*/ printf("\n\n*** Calculating distribution of pair types ***\n\n"); pset = pair_spectrum(seqs, data, nall, pset, &npt, &pnew, &miss, anc, pij); printf("\n\n *** Completed classification of pair types ***\n\n"); if (data->exact && (pnew || miss)) nrerror("Lookup table is not exact for sequences\n(possibly generated by interval)"); printf("\n\nOld = %i: New = %i: Missing = %i\n\n", npt,pnew,miss); data->ptt = (int) npt+pnew+miss; /*npt is number from likelihood file, pnew is number new with no missing data, miss is # new with missing data*/ if (verb) { strcpy(fname, data->prefix); tfp = fopen(strcat(fname, "type_table.txt"), "w"); if (!tfp) nrerror("Cannot open type file"); type_print(pij, data->lseq, data->w,tfp); fclose(tfp); } if (verb) print_pairs(stdout, pset, npt+pnew, data->hd, data->nseq); /*Need a complete set for missing data or diploid data - check this*/ if (!data->exact && (data->hd ==2 || miss)) { printf("\n\nMissing data or diploid: checking that likelihood table is exhaustive\n\n"); 
check_exhaustive(pset,npt,(data->nseq)*((int) data->hd)); } /*Read parameters and likelihoods from likelihood file - where appropriate*/ if (lkf) { read_pars(ifp3, &tcat, &data->th, &data->rcat, &data->rmax); lkmat = dmatrix(1,npt+pnew+miss,1,data->rcat); if (lkf) read_lk(ifp3, lkmat, npt, tcat, data->rcat); } /*If haploid, but novel types, need to calculate new likelihoods and input parameter values*/ if (data->hd ==1 && pnew) { /*Note can have pnew for diploid data, but this has been checked for already*/ if (!lkf) { data->th=data->rmax=-1.0; data->rcat=0; printf("\n\nInput theta per site (suggest Watterson estimate of %.5lf):",(double) data->lseq/(watterson(data->nseq*data->hd)*data->tlseq)); while (data->th<0.0) scanf("%lf", &data->th); printf("\n\nMax 4Ner for grid (suggest 100):"); while(data->rmax<0.0) scanf("%lf", &data->rmax); printf("\n\nNumber of points on grid (suggest 101, min=2):"); while(data->rcat<2) scanf("%i", &data->rcat); lkmat = dmatrix(1,npt+pnew+miss,1,data->rcat); } lk_est(pset,npt,pnew,lkmat,data->th,data->rcat,data->rmax); data->exact=1; } /*Sum over missing data or resolve genotypes and sum over missing data+configurations*/ else if (miss && data->hd==1) { printf("\n\n*** Calculating likelihoods for missing data ***\n\n"); for (i=1;i<=miss;i++) { lk_miss(pset[npt+i],lkmat[npt+i],lkmat,data); printf("\rType %i", i); } printf(" ...Done!\n\n"); } /*Sum over resolutions for diploid data*/ else if (data->hd==2 && !data->exact) { printf("\n\n*** Resolving diploid data: %i ***\n\n",pnew+miss); lkres = dvector(1,data->rcat); for (i=1;i<=pnew+miss;i++) { lk_resolve(lkres,pset[npt+i],lkmat[npt+i],lkmat,data); printf("\rType %i", i); } free_dvector(lkres,1,data->rcat); printf(" ...Done!\n\n"); } /*If new likelihood generated can output likelihood file for future analyses*/ if (verb) print_lks(pset, data, npt+pnew+miss, lkmat); /*Basic analysis - estimation of 4Ner asuming constant rate*/ data->rme=data->rmax; data->rce=data->rcat; if (1) { 
printf("\n\nDo you wish to change grid over which to estimate likelihoods for (default = %i points, 4Ner 0 - %.1lf) (1/0) :",data->rcat,data->rmax); scanf("%i", &lkf); if (lkf) { data->rme=-10; data->rce=0; printf("\n\nMax 4Ner for estimation : "); while (data->rme < 0.0) scanf("%lf", &data->rme); printf("\n\nNumber of classes to estimate for: "); while (data->rce < 1) scanf("%i", &data->rce); } } data->lksurf = dmatrix(1,data->rce,1,2); lk_surf(pset, pij, data, lkmat, data->th, locs, 1); /*Print marginal likelihood ratio test statistics for each pair of sites*/ printf("\n\nCalculating fits\n\n"); fit_pwlk(data,pij,locs,lkmat,verb); /*Sliding windows version*/ if (1) { printf("\n\nDo you wish to carry out a sliding windows analysis? (yes=1/no=0):"); scanf("%i", &sw_flag); } if (sw_flag) lk_win(pset,pij,data,lkmat,locs,nall); /*Nonparametric estimation of recombination rate*/ if (1) { printf("\n\nPrint out table of Rmin values?\n(0=No, 1=Total only, 2=Full table):"); scanf("%i", &rmin_flag); } if (rmin_flag) { rmin(data, pset, pij, locs, lkf-1); printf("\n\nLower bound on Rmin = %i\n\n",data->rmin); } /*Estimate 4Ner by Wakeley 1997 method*/ if (1) { printf("\n\nEstimate 4Ner by moment method? (yes=1, no=0)"); scanf("%i", &moment_flag); } if (moment_flag) wakeley_est(data, seqs, locs); /*Recombination tests - only available for haploid data!*/ if (data->hd==1) { if (1) { printf("\n\nDo you wish to test for recombination? (yes=1, no=0): "); scanf("%i", &test_flag); } if (test_flag) { rec_test(data, pij, locs, lkmat, pset, npt+pnew+miss); } } /*Conditional simulation - only available for haploid data with a complete lk file*/ if (data->hd==1 && !(data->exact)) { if (1) { printf("\n\nDo you wish to test constant-rate model and estimate sampling distribution by simulation? (yes=1/no=0): "); scanf("%i", &test_flag); } if (test_flag) { freq_min(locs, flocs, nall, data); printf("\n\nHow many simulations? 
"); scanf("%i", &lkf); snp_sim(locs, flocs, pset, lkmat, lkf, data); } } free_imatrix(pij,1,data->lseq,1,data->w); free_imatrix(seqs,1,data->nseq,1,data->lseq); free_imatrix(nall,1,data->lseq,1,5); for (i=1;i<sizeofpset;i++) free(pset[i]); free(pset); free(data); free_dvector(locs, 1, data->lseq); free_ivector(flocs, 1, data->lseq); /* system("PAUSE"); */ }
int km_coffee_align3(char *seq_f, int k, int k_leaf, char *method, char *aln_f, int n_cores, int gapopen, int gapext, char *init) { char *use_as_temp = get_tmp_4_tcoffee(); #ifdef _OPENMP omp_set_num_threads(n_cores); #endif SeqSet *seq_set = read_fasta(seq_f); qsort(seq_set->seqs, seq_set->n_seqs, sizeof(Seq*), my_seq_sort); srand(time(0)); short j = -1; short i; /**************************************************** Sequences to vector using k-mers *****************************************************/ short alphabet[256]; // standard alphabet for (i = 65; i < 91; ++i) if ((i==66) || (i==74) || (i==79) || (i==88) || (i==90)) alphabet[i] = 0; else alphabet[i] = ++j; j=-1; for (i = 97; i < 123; ++i) if ((i==98) || (i==106) || (i==111) || (i==120) || (i==122)) alphabet[i] = 0; else alphabet[i] = ++j; // shrinked alphabet // for (i = 0; i < 256; ++i) // alphabet[i] = 0; // char *groups[]={"LlVvIiMmCcAaGgSsTtPpFfYyWw","EeDdNnQqKkRrHh"}; // char *groups[]={"LlVvIiMmCc","AaGgSsTtPp","FfYyWw","EeDdNnQqKkRrHh"}; // size_t n_groups = 4; // size_t len; // char *group; // for (i=0; i<n_groups; ++i) // { // group=groups[i]; // len=strlen(group); // for (j=0; j<len; ++j) // alphabet[group[j]]=i+1; // } VectorSet *vec_set = seqset2vecs_kmer(seq_set, 2, 21, alphabet); /**************************************************** Sequences to vector using distances *****************************************************/ // char *groups[]={"LVIMC","AGSTP","FYW","EDNQKRH"}; // size_t n_groups = 4; // char *groups[]={"LlVvIiMmCc","AaGgSsTtPp","FfYyWw","EeDdNnQqKkRrHh"}; // char *groups[]={"LlVvIiMmCcAaGgSsTtPpFfYyWw","EeDdNnQqKkRrHh"}; // size_t n_groups = 2; // VectorSet *vec_set = seqset2vecs_whatever(seq_set, groups, n_groups); // char vec_file[500]; // sprintf(vec_file, "%s_2_8_%li_%li.txt", strrchr(seq_f, '/')+1, vec_set->n_vecs, vec_set->dim); // print_vecs(vec_set, &vec_file[0]); // read_vecs(vec_set, "matrix_59"); // exit(1); // normalize(vec_set); KM_node *root = 
hierarchical_kmeans(vec_set, k, k_leaf, init, 0.001); // KM_node *root = simple_clust(vec_set, k); char templatee[400]; sprintf(templatee, "%s/km_coffee_tmp_XXXXXX", use_as_temp); char tmp_str[FILENAME_MAX]; km_cwd = getcwd(tmp_str, FILENAME_MAX); km_tmp_dir = my_make_temp_dir(templatee, "main"); chdir(km_tmp_dir); char out_f[500]; if (aln_f[0] != '/') sprintf(out_f, "%s/%s", km_cwd, aln_f); else sprintf(out_f, "%s", aln_f); size_t n_vecs = seq_set->n_seqs; int *assignment = (int*)malloc(n_vecs*sizeof(int)); size_t l; for (l = 0; l< n_vecs; ++l) assignment[l]=vec_set->vecs[l]->id; // printf("TRAVERSE\n"); delVecSet(vec_set); traverse_km_tree(root, assignment, seq_set, out_f, n_cores, gapopen, gapext, method); free( assignment); del_tree(root); delSeqSet(seq_set); free(km_tmp_dir); return EXIT_SUCCESS; }