int main(int argc, char* argv[]) { std::string searchFile(argv[2]); uint topN(std::stoi(argv[3])); std::ifstream corpus(argv[1]); std::istream_iterator<std::string> corpus_it(corpus), eof; std::vector<std::string> fileList(corpus_it, eof); strIntMap corpusMap, documentMap; std::cout << "Loading corpus using files listed in " << argv[1] << std::endl; loadCorpusAndSearchFiles(corpusMap, documentMap, searchFile, fileList); std::cout << "Loaded corpus of " << corpusMap.size() << " words from " << fileList.size() << " file(s)" << std::endl << "------[ Starting analysis ]------" << std::endl << "Top " << topN << " significant words..." << std::endl; std::set<tfidfPair> result; getTopN(topN, fileList.size(), result, documentMap, corpusMap); printTopN(result); std::cout << "Lines with 1 or more significant words:" << std::endl; countSigWords(searchFile, result); return 0; }
void reads::detectBS() { param.fragLengthMean += param.smoothArm; // add the soomth arm to the average length //param.fragLengthVar = param.fragLengthMean * param.fragLengthMean / 8.0; param.fragLengthVar = param.fragLengthMean * param.fragLengthMean / 10.0; //param.fragLengthVar = param.fragLengthMean * 20.0; param.lmLength = (int) (0.8 * param.fragLengthMean); param.lmArm = param.lmLength / 2; vector<int> f_maxPos, r_maxPos; vector<double> f_max, r_max, tmp, tmp2; //double R2_theshold = param.R2_cutoff; int i, j; int totalRC = readsClusterVec.size(); //int arm = param.smoothArm + param.lmArm; int arm; if (param.smoothArm > 0) arm = param.smoothArm + param.lmArm + 5; //param.motifWidth / 4; // add the half motif width for shift the binding site a litter farther from the signal else arm = 3 * param.smoothBandwidth + param.lmArm + 5; //param.motifWidth / 4; vector<readsCluster> sub_rcv; char strand; double pval; double foldChange; double qval1, qval2; int chrIdx; int pos = 0; double R2 = 0; double slope = 0; if (totalRC == 0) return; // for linear model lm *lm_signal; lm_signal = new lm(); lm_signal->generateX(); sort(readsClusterVec.begin(), readsClusterVec.end(), mem_fun_ref(&readsCluster::sortByPos)); for (i=0; i<totalRC; i++) { strand = readsClusterVec[i].r_strand; chrIdx = readsClusterVec[i].r_chrIdx; pval = readsClusterVec[i].pval; foldChange = readsClusterVec[i].foldChange; qval1 = readsClusterVec[i].qval1; qval2 = readsClusterVec[i].qval2; if (strand == 'f' || strand == '+') { if (lm_signal->slipSolve(readsClusterVec[i].r_vec, 'f')) { logVec(lm_signal->slopeVec, tmp); multiVec(tmp, lm_signal->R2Vec, tmp2); //printf("fwd R2 slope log_slope R2*log_slope vecs:\n"); //printVec(lm_signal->R2Vec); //printVec(lm_signal->slopeVec); //printVec(tmp); //printVec(tmp2); getTopN(tmp2, lm_signal->slopeVec, f_max, f_maxPos, 2, param.motifWidth); //getTopN(lm_signal->R2Vec, lm_signal->slopeVec, f_max, f_maxPos, 2, param.motifWidth); for (j=0; j<(int)f_maxPos.size(); j++) { pos = readsClusterVec[i].r_end + arm - f_maxPos[j]; R2 = lm_signal->R2Vec[f_maxPos[j]]; slope = lm_signal->slopeVec[f_maxPos[j]]; bingdingSite newBS(chrIdx, pos, R2, slope, strand, pval, foldChange, qval1, qval2); BSVec.push_back(newBS); if (slope > maxSlope) maxSlope = slope; if (R2 > maxR2) maxR2 = R2; if (foldChange > maxFC) maxFC = foldChange; /*if (lm_signal->R2Vec[f_maxPos[j]] > R2_theshold) { bingdingSite newBS(readsClusterVec[i].r_chrIdx, readsClusterVec[i].r_end + arm - f_maxPos[j], lm_signal->R2Vec[f_maxPos[j]], lm_signal->slopeVec[f_maxPos[j]], '+'); //printf("%d\t%d\t%d\t%d\t%f\t%f\t+\n",readsClusterVec[i].r_end, arm, f_maxPos[j], readsClusterVec[i].r_end + arm - f_maxPos[j], lm_signal->R2Vec[f_maxPos[j]], lm_signal->slopeVec[f_maxPos[j]]); BSVec.push_back(newBS); if (lm_signal->slopeVec[f_maxPos[j]] > maxSlope) maxSlope = lm_signal->slopeVec[f_maxPos[j]]; }*/ } } } if (strand == 'r' || strand == '-') { if (lm_signal->slipSolve(readsClusterVec[i].r_vec, 'r')) { logVec(lm_signal->slopeVec, tmp); multiVec(tmp, lm_signal->R2Vec, tmp2); //printf("rvs R2 slope log_slope R2*log_slope vecs:\n"); //printVec(lm_signal->R2Vec); //printVec(lm_signal->slopeVec); //printVec(tmp); //printVec(tmp2); getTopN(tmp2, lm_signal->slopeVec, r_max, r_maxPos, 2, param.motifWidth); //getTopN(lm_signal->R2Vec, lm_signal->slopeVec, r_max, r_maxPos, 2, param.motifWidth); for (j=0; j<(int)r_maxPos.size(); j++) { pos = readsClusterVec[i].r_start - arm + r_maxPos[j]; R2 = lm_signal->R2Vec[r_maxPos[j]]; slope = lm_signal->slopeVec[r_maxPos[j]]; bingdingSite newBS(chrIdx, pos, R2, slope, strand, pval, foldChange, qval1, qval2); BSVec.push_back(newBS); if (slope > maxSlope) maxSlope = slope; if (R2 > maxR2) maxR2 = R2; if (foldChange > maxFC) maxFC = foldChange; /*if (lm_signal->R2Vec[r_maxPos[j]] > R2_theshold) { bingdingSite newBS(readsClusterVec[i].r_chrIdx, readsClusterVec[i].r_start - arm + r_maxPos[j], lm_signal->R2Vec[r_maxPos[j]], lm_signal->slopeVec[r_maxPos[j]], '-'); //printf("%d\t%d\t%d\t%d\t%f\t%f\t-\n",readsClusterVec[i].r_start, arm, r_maxPos[j], readsClusterVec[i].r_start - arm + r_maxPos[j], lm_signal->R2Vec[r_maxPos[j]], lm_signal->slopeVec[r_maxPos[j]]); BSVec.push_back(newBS); if (lm_signal->slopeVec[r_maxPos[j]] > maxSlope) maxSlope = lm_signal->slopeVec[r_maxPos[j]]; }*/ } } } //chrIdx(chrIdx), pos(pos), R2(R2), slope(slope), strand(strand), pval(pval), foldChange(foldChange),qval(qval) /*if (param.VERBOSE && i % 100 == 0) printf(" %.2f%% done...\r", 100.0 * (double) i / (double) totalRC);*/ } }