Exemple #1
0
// Entry point: reports the most significant words of one document relative to
// a corpus (presumably TF-IDF based, judging by the tfidfPair type), then the
// lines of that document containing at least one of those words.
//
// Usage: <program> <corpus-list-file> <search-file> <topN>
//   argv[1]  file whose whitespace-separated tokens are the corpus file names
//   argv[2]  the document to analyse
//   argv[3]  how many significant words to report (non-negative integer)
//
// Returns 0 on success, 1 when arguments are missing or invalid, or when the
// corpus list file cannot be opened. A topN that is not a number at all makes
// std::stoi throw (std::invalid_argument / std::out_of_range).
int main(int argc, char* argv[]) {
    // argv[1..3] are all required; reading them unchecked is undefined behaviour.
    if (argc < 4) {
        std::cerr << "Usage: " << (argc > 0 ? argv[0] : "program")
                  << " <corpus-list-file> <search-file> <topN>" << std::endl;
        return 1;
    }

    std::string searchFile(argv[2]);

    // Parse as signed first so a negative count is rejected instead of
    // silently wrapping around to a huge unsigned value.
    const int requestedTopN = std::stoi(argv[3]);
    if (requestedTopN < 0) {
        std::cerr << "topN must be non-negative, got " << requestedTopN
                  << std::endl;
        return 1;
    }
    uint topN(static_cast<uint>(requestedTopN));

    std::ifstream corpus(argv[1]);
    if (!corpus.is_open()) {
        // Without this check an unreadable list file just produces an empty
        // corpus and the analysis runs on nothing.
        std::cerr << "Cannot open corpus list file " << argv[1] << std::endl;
        return 1;
    }

    // Every whitespace-separated token of the list file becomes one entry.
    std::istream_iterator<std::string> corpus_it(corpus), eof;
    std::vector<std::string> fileList(corpus_it, eof);

    strIntMap corpusMap, documentMap;
    std::cout << "Loading corpus using files listed in " << argv[1]
              << std::endl;
    loadCorpusAndSearchFiles(corpusMap, documentMap, searchFile, fileList);
    std::cout << "Loaded corpus of " << corpusMap.size() << " words from "
              << fileList.size() << " file(s)" << std::endl
              << "------[ Starting analysis ]------" << std::endl << "Top "
              << topN << " significant words..." << std::endl;

    std::set<tfidfPair> result;
    getTopN(topN, fileList.size(), result, documentMap, corpusMap);
    printTopN(result);

    std::cout << "Lines with 1 or more significant words:" << std::endl;
    countSigWords(searchFile, result);
    return 0;
}
Exemple #2
0
void reads::detectBS()
{
	param.fragLengthMean += param.smoothArm; // add the soomth arm to the average length
	//param.fragLengthVar  = param.fragLengthMean * param.fragLengthMean / 8.0;
	param.fragLengthVar  = param.fragLengthMean * param.fragLengthMean / 10.0;
	//param.fragLengthVar  = param.fragLengthMean * 20.0;
	param.lmLength = (int) (0.8 * param.fragLengthMean);
	param.lmArm = param.lmLength / 2;
	vector<int> f_maxPos, r_maxPos;
	vector<double> f_max, r_max, tmp, tmp2;
	//double R2_theshold = param.R2_cutoff;
	int i, j;
	int totalRC = readsClusterVec.size();
	//int arm = param.smoothArm + param.lmArm;
  int arm;
  if (param.smoothArm > 0) 
	  arm = param.smoothArm + param.lmArm + 5; //param.motifWidth / 4; // add the half motif width for shift the binding site a litter farther from the signal
  else
	  arm = 3 * param.smoothBandwidth + param.lmArm + 5; //param.motifWidth / 4;
	vector<readsCluster> sub_rcv;
	char strand;
	double pval;
	double foldChange;
	double qval1, qval2;
	int chrIdx;
	int pos = 0;
	double R2 = 0;
	double slope = 0;

	if (totalRC == 0)
		return;

	// for linear model
	lm *lm_signal;
	lm_signal = new lm();
	lm_signal->generateX();

	sort(readsClusterVec.begin(), readsClusterVec.end(), mem_fun_ref(&readsCluster::sortByPos));
	for (i=0; i<totalRC; i++)
	{
		strand = readsClusterVec[i].r_strand;
		chrIdx = readsClusterVec[i].r_chrIdx;
		pval = readsClusterVec[i].pval;
		foldChange = readsClusterVec[i].foldChange;
		qval1 = readsClusterVec[i].qval1;
		qval2 = readsClusterVec[i].qval2;
		if (strand == 'f' || strand == '+')
		{

			if (lm_signal->slipSolve(readsClusterVec[i].r_vec, 'f'))
			{
        logVec(lm_signal->slopeVec, tmp);
				multiVec(tmp, lm_signal->R2Vec, tmp2);

        //printf("fwd R2 slope log_slope R2*log_slope vecs:\n");
        //printVec(lm_signal->R2Vec);
        //printVec(lm_signal->slopeVec);
        //printVec(tmp);
        //printVec(tmp2);

				getTopN(tmp2, lm_signal->slopeVec, f_max, f_maxPos, 2, param.motifWidth);
				//getTopN(lm_signal->R2Vec, lm_signal->slopeVec, f_max, f_maxPos, 2, param.motifWidth);
				for (j=0; j<(int)f_maxPos.size(); j++)
				{
					pos = readsClusterVec[i].r_end + arm - f_maxPos[j];
					R2 = lm_signal->R2Vec[f_maxPos[j]];
					slope = lm_signal->slopeVec[f_maxPos[j]];

		      bingdingSite newBS(chrIdx, pos, R2, slope, strand, pval, foldChange, qval1, qval2);
		      BSVec.push_back(newBS);
		      if (slope > maxSlope)
			      maxSlope = slope;
		      if (R2 > maxR2)
			      maxR2 = R2;
		      if (foldChange > maxFC)
			      maxFC = foldChange;

					/*if (lm_signal->R2Vec[f_maxPos[j]] > R2_theshold)
					{
						bingdingSite newBS(readsClusterVec[i].r_chrIdx, readsClusterVec[i].r_end + arm - f_maxPos[j], lm_signal->R2Vec[f_maxPos[j]], lm_signal->slopeVec[f_maxPos[j]], '+');
						//printf("%d\t%d\t%d\t%d\t%f\t%f\t+\n",readsClusterVec[i].r_end, arm, f_maxPos[j], readsClusterVec[i].r_end + arm - f_maxPos[j], lm_signal->R2Vec[f_maxPos[j]], lm_signal->slopeVec[f_maxPos[j]]);
						BSVec.push_back(newBS);
						if (lm_signal->slopeVec[f_maxPos[j]] > maxSlope)
							maxSlope = lm_signal->slopeVec[f_maxPos[j]];
					}*/
				}
			}
		}
		if (strand == 'r' || strand == '-')
		{
			if (lm_signal->slipSolve(readsClusterVec[i].r_vec, 'r'))
			{
        logVec(lm_signal->slopeVec, tmp);
				multiVec(tmp, lm_signal->R2Vec, tmp2);

        //printf("rvs R2 slope log_slope R2*log_slope vecs:\n");
        //printVec(lm_signal->R2Vec);
        //printVec(lm_signal->slopeVec);
        //printVec(tmp);
        //printVec(tmp2);

				getTopN(tmp2, lm_signal->slopeVec, r_max, r_maxPos, 2, param.motifWidth);
				//getTopN(lm_signal->R2Vec, lm_signal->slopeVec, r_max, r_maxPos, 2, param.motifWidth);
				for (j=0; j<(int)r_maxPos.size(); j++)
				{
					pos = readsClusterVec[i].r_start - arm + r_maxPos[j];
					R2 = lm_signal->R2Vec[r_maxPos[j]];
					slope = lm_signal->slopeVec[r_maxPos[j]];

		      bingdingSite newBS(chrIdx, pos, R2, slope, strand, pval, foldChange, qval1, qval2);
		      BSVec.push_back(newBS);
		      if (slope > maxSlope)
			      maxSlope = slope;
		      if (R2 > maxR2)
			      maxR2 = R2;
		      if (foldChange > maxFC)
			      maxFC = foldChange;

					/*if (lm_signal->R2Vec[r_maxPos[j]] > R2_theshold)
					{
						bingdingSite newBS(readsClusterVec[i].r_chrIdx, readsClusterVec[i].r_start - arm + r_maxPos[j], lm_signal->R2Vec[r_maxPos[j]], lm_signal->slopeVec[r_maxPos[j]], '-');
						//printf("%d\t%d\t%d\t%d\t%f\t%f\t-\n",readsClusterVec[i].r_start, arm, r_maxPos[j], readsClusterVec[i].r_start - arm + r_maxPos[j], lm_signal->R2Vec[r_maxPos[j]], lm_signal->slopeVec[r_maxPos[j]]);
						BSVec.push_back(newBS);
						if (lm_signal->slopeVec[r_maxPos[j]] > maxSlope)
							maxSlope = lm_signal->slopeVec[r_maxPos[j]];
					}*/
				}
			}
		}
		//chrIdx(chrIdx), pos(pos), R2(R2), slope(slope), strand(strand), pval(pval), foldChange(foldChange),qval(qval)

		/*if (param.VERBOSE && i % 100 == 0)
			printf("  %.2f%% done...\r", 100.0 * (double) i / (double) totalRC);*/
	}
}