Beispiel #1
0
/* @brief	For an alignment column
 * 		- generate cons_type [SProfile] for SNP entries;
 * 		- generate [LProfile] for LP entries i.e. Dx or Ix.
 * 		- Compute lamda values [lamda_S/L], and raw probabilities in [p_S/L]
 */
void pileup_profiling (pileup_profile_t& profile_info, const col_t& col,
		const std::vector<jeb_t>& jeb, const GlobalParam& gParam) {

	int num_entry = col.entries.size();
	bool is_ins_col = (col.ref_pos % 2 == 1) ? true : false;

	for (int i = 0; i < num_entry; ++ i) {
		if (col.entries[i].cons_type.at(0) == 'D'
				|| col.entries[i].cons_type.at(0) == 'I') {
			add_to_profile (profile_info.LProfile, col.entries[i].cons_type);
		} else {
			if (is_ins_col)	add_to_profile (profile_info.LProfile, "d");
			else {
				add_to_profile (profile_info.LProfile, "i");
				add_to_profile (profile_info.SProfile, col.entries[i].cons_type);
			}
		}

		if (jeb[col.entries[i].eb_index].lp_sum != 0) { //

			//double tmp = (jeb[col.entries[i].eb_index].lp_err + 1.0)/
			//		(jeb[col.entries[i].eb_index].lp_sum + 1.0);
			//tmp = (tmp == 1.0) ? 0.5 : tmp;
			int idx = col.entries[i].eb_index;
			// constant prior
			//double tmp = (jeb[idx].lp_err + gParam.bkinfo.get_prior_L())/
			//			 (jeb[idx].lp_sum + gParam.bkinfo.average_L());
			// weighted prior
			//double tmp = (jeb[idx].lp_err * gParam.bkinfo.average_L() +
			//		jeb[idx].lp_sum * gParam.bkinfo.get_prior_L())/
			//		(2 * jeb[idx].lp_sum * gParam.bkinfo.average_L());

			//lamda_lp += std::min(tmp, gParam.lenpoly_pe); // over calling
			double pe = get_pe (jeb[idx].lp_err, jeb[idx].lp_sum,
				1.0 * gParam.bkinfo.get_prior_L()/ gParam.bkinfo.average_L());

			profile_info.lamda_L += pe;		// under calling
			profile_info.p_L.push_back(pe);
		}

		if (! is_ins_col && col.entries[i].cons_type.at(0) != 'D'
				&& jeb[col.entries[i].eb_index].snp_sum != 0) { // SC

			//double tmp = (jeb[col.entries[i].eb_index].snp_err + 1.0)/
			//			(jeb[col.entries[i].eb_index].snp_sum + 1.0);
			//tmp = (tmp == 1.0) ? 0.5 : tmp;
			int idx = col.entries[i].eb_index;
			//double tmp = (jeb[idx].snp_err + gParam.bkinfo.get_prior_S())/
			//			(jeb[idx].snp_sum + gParam.bkinfo.average_S());

			// weighted piror
			//double tmp = (jeb[idx].snp_err * gParam.bkinfo.average_S() +
			//		jeb[idx].snp_sum * gParam.bkinfo.get_prior_S())/
			//		(2 * jeb[idx].snp_sum * gParam.bkinfo.average_S());
			//lamda_snp += gParam.pe; // uniform
			//lamda_snp += std::min(gParam.pe, tmp); // over-calling

			double pe = get_pe (jeb[idx].snp_err, jeb[idx].snp_sum,
				1.0 * gParam.bkinfo.get_prior_S()/ gParam.bkinfo.average_S());
			profile_info.lamda_S += pe;
			profile_info.p_S.push_back(pe);
		}

	}// for (int i = 0
} // pileup_profiling
Beispiel #2
0
int main(int argc, char** argv) {
    //structs used for timing
    struct timeval tval_before, tval_after, tval_result;
    gettimeofday(&tval_before, NULL);

    FILE * fp;

    int option = 0;
    //parse arguments
    while((option = getopt(argc, argv, "k:d:t:")) != -1) {
        switch(option) {
            case 'k': 
                k = atoi(optarg);
                break;
            case 'd':
                d = atoi(optarg);
                break;
            case 't':
                t = atoi(optarg);
                break;
            default:
                printf("Usage: motif_finder [-k motif_size] [-d wild_cards] [-t max_time] input_file");
                break;
        }
    }

    if((fp = fopen(argv[argc-1], "r")) == NULL) {
        printf("FILE: \"%s\" DOESN'T EXIST\n", argv[argc - 1]);
        exit(1); //Failed to open file.
    }

    //count how big our buffer needs to be (every other line)
    char line[80];
    int totalSequences = 0;
    while(fgets(line, 80, fp) != NULL) {
    	if(line[0] == '>') {
    		totalSequences++;
    	}
    }
    rewind(fp);//reset fp to start of file
    //read in the sequences. account for multi-line sequences
    char* sequences[totalSequences];
    int sequenceNum = 0;
    int sequenceLength = 0;
    int beginningOfSequence = 1;
    while(fgets(line, 80, fp) != NULL) {
    	//sequence header
    	if(line[0] == '>') {
    		sscanf(line, ">Sequence%d length %d", &sequenceNum, &sequenceLength);
    		sequenceNum--; //keep 0 indexed
            //note: sequenceLength is multiplied by 2, because it seemed that some of the "lengths" were off in the file
    		char* temp  = malloc((sequenceLength*2)*sizeof(char));
            if(temp == NULL) {
                return -1;
            }
            sequences[sequenceNum] = temp;
    		beginningOfSequence = 1;
        //read the line (if its the start of a sequence)
    	} else if(beginningOfSequence) {
    		strcpy(sequences[sequenceNum], line);
    		beginningOfSequence = 0;
        //otherwise append it to the current sequences
    	} else {
    		strcat(sequences[sequenceNum], line);
    	}
        //clear line (to be safe)
        memset(line, 0, 80*sizeof(char));
    }
    fclose(fp);

    //determine frequency
    int a_count = 0, c_count = 0, g_count = 0, t_count = 0;
    double total_count = 0; //to force double division later for probabilities
    for(int i = 0; i < totalSequences; i++) {
    	for(int j = 0; j < strlen(sequences[i]); j++) {
    		switch(sequences[i][j]) {
    			case 'A':
    				a_count++;
    				total_count++;
    				break;
    			case 'C':
    				c_count++;
    				total_count++;
    				break;
    			case 'G':
    				g_count++;
    				total_count++;
    				break;
    			case 'T':
    				t_count++;
    				total_count++;
    				break;
    		}
    	}
    }
    //only calculate this once
    double a_freq = a_count/total_count;
    double c_freq = c_count/total_count;
    double g_freq = g_count/total_count;
    double t_freq = t_count/total_count;
    double freqs[4];
    freqs[0] = a_freq;
    freqs[1] = c_freq;
    freqs[2] = g_freq;
    freqs[3] = t_freq;

    //initialize our profile matrix
    int* profile[4];
    for(int i = 0; i < 4; i++) {
    	profile[i] = (int*)malloc((k+1)*sizeof(int));//slot 0 will be the total count
    	memset(profile[i], 0, (k+1)*sizeof(int));
    }

    //pick random starting locations
    int startIndices[totalSequences];
    srand(time(NULL));
    for(int i = 0; i < totalSequences; i++) {
    	startIndices[i] = rand() % (strlen(sequences[i])-k);
    }
    generate_profile(startIndices, profile, sequences, totalSequences);
    //kind of hacky to avoid errors when d == 0
    int* dontCares = malloc(((d==0)?1:d)*sizeof(int));
    int* bestDontCares = malloc(((d==0)?1:d)*sizeof(int));
    //grab the inital score
    double logScore = profile_score(profile, totalSequences, freqs, dontCares);
    double bestLogScore = logScore;
    //more timing
    gettimeofday(&tval_after, NULL);
    timersub(&tval_after, &tval_before, &tval_result);

    //proceed until we're out of time
    while(tval_result.tv_sec < t) {
        //for each sequence
        for(int i = 0; i < totalSequences; i++) {
            int oldStartIndex = startIndices[i];
            int currBestStart = oldStartIndex;
            //try each starting position
            remove_from_profile(i, startIndices, profile, sequences);
            for(int j = 0; j < strlen(sequences[i])-k; j++) {
                if(j != oldStartIndex) {
                    startIndices[i] = j;
                    add_to_profile(i, startIndices, profile, sequences);
                    logScore = profile_score(profile, totalSequences, freqs, dontCares);
                    //keep track of the best score so far
                    if(logScore > bestLogScore) {
                        currBestStart = j;
                        bestLogScore = logScore;
                        memcpy(bestDontCares, dontCares, d*sizeof(int));
                    }
                    remove_from_profile(i, startIndices, profile, sequences);
                }
            }
            //reset profile with the best new starting index for sequences i
            startIndices[i] = currBestStart;
            add_to_profile(i, startIndices, profile, sequences);
            //check if we're out of time yet
            gettimeofday(&tval_after, NULL);
            timersub(&tval_after, &tval_before, &tval_result);
            if(tval_result.tv_sec >= t) {
                break;
            }
        }
    }
    //done with algorithm, now just print stuff
    //figure out the motif
    char* motif = (char*)malloc((k+1)*sizeof(char));
    if(motif == NULL) {
        return -1;
    }
    for(int i = 0; i < k; i++) {
        int biggestChar = 0;
        motif[i] = 'A';
        if(isValInArray(i, bestDontCares)) {
            motif[i] = '*';
            continue;
        }
        if(profile[1][i+1] > profile[biggestChar][i+1]) {
            biggestChar = 1;
            motif[i] = 'C';
        }
        if(profile[2][i+1] > profile[biggestChar][i+1]) {
            biggestChar = 2;
            motif[i] = 'G';
        }
        if(profile[3][i+1] > profile[biggestChar][i+1]) {
            biggestChar = 3;
            motif[i] = 'T';
        }
    }
    //print results
    motif[k] = '\0';
    printf("Best motif of length %d with %d don't cares is %s\n", k, d, motif);
    printf("Log likelihood is %lf\n", bestLogScore);
    printf("Loci of the best motif are here:\n");

    for(int i = 0; i < totalSequences; i++) {
        printf("%d\n", startIndices[i]);
        free(sequences[i]);
    }
    //clean up heap
    for(int i = 0; i < 4; i++) {
        free(profile[i]);
    }
    free(dontCares);
    free(bestDontCares);
    free(motif);
    return 0;
}