Exemple #1
0
/* Locates the start position of every record inside a tag path code sequence.
 *
 * The function runs two passes over the sequence:
 *
 *   1. A first-difference pass over a trimmed copy of `s`. Each difference is
 *      weighted by the preceding code, runs of same-signed differences are
 *      accumulated, and the positions where a run ends are grouped by their
 *      accumulated value. Groups are then visited in ascending key order until
 *      the estimated coverage (gap / s.size()) exceeds `st`. The outcome of
 *      this pass is only traced to cerr and used as the fallback count shown
 *      in the "*** FINAL" trace line.
 *
 *   2. An experimental scoring pass (marked TEST below) that counts how often
 *      each code occurs in `s` and selects the root tag from codes that are
 *      positive and make up at least 1% of the sequence. This pass alone
 *      decides the returned positions.
 *
 * s:  sequence of tag path codes, one code per character.
 * st: coverage threshold, as a fraction of s.size(), that stops pass 1.
 *
 * Returns every index i for which s[i] equals the selected root tag code.
 * An empty sequence yields an empty result.
 */
vector<unsigned int> tTPSFilter::locateRecords(wstring s, float st) {
	vector<unsigned int> recpos;

	// Nothing to locate in an empty sequence. Returning here also prevents
	// s.size()-1 below from wrapping around to a huge unsigned value.
	if (s.empty()) return recpos;

	vector<int> d(s.size()-1);
	map<int, vector<int> > diffMap;
	map<int, int> TPMap;
	int rootTag=0;
	int tagCount=0;
	int gap=0;
	size_t interval=0xffffffff;	// sentinel: no interval measured yet

	// work on a trimmed copy; `off` is added to positions found in the copy
	// before they are used as indices into `s`
	auto z=s;
	auto off=trimSequence(z);

	cerr << off << " diff: " << endl;

	// first difference of the trimmed sequence, weighted by the preceding code.
	// `d` was sized from the untrimmed sequence, so never write past its end.
	for (size_t i=1;i<z.size() && i<=d.size();i++) {
		d[i-1]=(z[i]-z[i-1])*z[i-1];
	}

	// accumulate runs of same-signed differences; when a run ends and the next
	// difference is non-negative, record the position under the run's value
	for (size_t i=1;i<d.size();i++) {
		// NOTE(review): the doubled sign() call is kept exactly as it was;
		// confirm whether a single sign(d[i]) was intended.
		if (sign(d[i-1])==sign(sign(d[i]))) {
			if (sign(d[i])>0)
				d[i] += d[i-1];
			else
				d[i] -= d[i-1];
		} else if (d[i] >= 0) {
			diffMap[d[i-1]].push_back(i+off);
		}
		cerr << (d[i-1]<0?d[i-1]:0) << " ";
	}
	cerr << endl;

	// process lowest values until the gap between points achieves enough
	// sequence coverage (an empty diffMap simply skips this loop)
	for (const auto& group : diffMap) {
		const vector<int>& positions = group.second;

		for (size_t j=0;j<positions.size();j++) {
			const int pos = positions[j];
			TPMap[s[pos]]++;
			cerr << "TPS[" << pos << "] = " << s[pos] << endl;
			// NOTE(review): j>1 leaves the distance between the first two
			// positions unmeasured; confirm whether j>0 was intended.
			if (j>1) {
				size_t itv = abs(positions[j]-positions[j-1]);
				if (itv < interval) interval = itv;
			}
		}
		if (interval!=0xffffffff) gap += interval*positions.size();
		if ((static_cast<float>(gap) / static_cast<float>(s.size())) > st) break;
	}

	// find the most frequent tag path code within the lowest difference values
	for (const auto& entry : TPMap) {
		cerr << entry.first << " " << entry.second << endl;
		if (entry.second > tagCount) {
			tagCount = entry.second;
			rootTag = entry.first;
		}
	}

	/* TEST: experimental root tag selection, overrides the choice made above */
	TPMap.clear();
	float maxScore=0;
	// count every code of the sequence (stops at the last character)
	for (size_t i=0;i<s.size();i++) {
		TPMap[s[i]]++;
	}
	rootTag=0;
	for (const auto& entry : TPMap) {
		const int code = entry.first;
		const int count = entry.second;
		if (code <= 0) continue;			// only positive codes qualify
		if (count < (s.size()*0.01)) continue;	// ignore codes below 1% of the sequence
		const float score = static_cast<float>(count) / static_cast<float>(code);
		cerr << "*** SCORE " << code << " / " << count << " = " << score << endl;
		if (score >= maxScore) {
			// take a candidate only if no root tag is set yet or it has a lower code
			if ((rootTag > code) || (!rootTag)) {
				tagCount = count;
				rootTag = code;
				maxScore = score;
			}
		}
	}
	cerr << "*** FINAL " << rootTag << " / " << tagCount << " = " << maxScore << endl;
	/* end TEST */

	// find the beginning of each record, using the tag path code found before
	for (size_t i=0;i<s.size();i++) {
		if (s[i] == rootTag) {
			recpos.push_back(static_cast<unsigned int>(i));
		}
	}

	return recpos;
}
// Checks the iterator returned by trimSequence() for a series of inputs,
// trimming from the left (third argument true) and from the right (false).
TEST(TrimSequenceTest, trimSequenceTest)
{
    // Distance of the returned iterator from both ends of the input, measured
    // after the call.
    struct TrimPoint {
        std::string::difference_type fromBegin;
        std::string::difference_type fromEnd;
    };

    // Runs trimSequence() on a private copy of `text` and reports where the
    // returned iterator lies.
    auto locate = [](std::string text, char stop, bool fromLeft) -> TrimPoint {
        const std::string::iterator cut = trimSequence(text, stop, fromLeft);
        TrimPoint where = { cut - text.begin(), text.end() - cut };
        return where;
    };

    // numeric inputs, trimming from the left
    EXPECT_EQ(locate("445566", '5', true).fromBegin, 2);
    EXPECT_EQ(locate("445554555", '5', true).fromBegin, 6);
    EXPECT_EQ(locate("4455545556", '5', true).fromBegin, 6);
    EXPECT_EQ(locate("44555455566", '5', true).fromBegin, 6);

    // numeric inputs, trimming from the right
    EXPECT_EQ(locate("665544", '5', false).fromEnd, 2);
    EXPECT_EQ(locate("555455544", '5', false).fromEnd, 6);
    EXPECT_EQ(locate("6555455544", '5', false).fromEnd, 6);

    // The remaining cases are Paul's test cases from TrimSequence.cpp.
    const std::string ascending = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
    const std::string ascendingPadded = "AAAAAAAABCDEFGHIJKLMNOPQRSTUVWXYZ";
    const std::string descending = "ZYXWVUTSRQPONMLKJIHGFEDCBA";
    const std::string descendingPadded = "ZYXWVUTSRQPONMLKJIHGFEDCBAAAAAAAA";

    //
    // from the left:
    //
    EXPECT_EQ(locate(ascending, 'A', true).fromBegin, 0);
    EXPECT_EQ(locate(ascending, '~', true).fromEnd, 0);
    EXPECT_EQ(locate("AAAAABCDEFGHIJKLMNOPQRSTUVWXYZ", 'B', true).fromBegin, 5);
    EXPECT_EQ(locate(ascendingPadded, 'B', true).fromBegin, 8);
    EXPECT_EQ(locate(ascendingPadded, 'F', true).fromBegin, 12);
    EXPECT_EQ(locate(ascendingPadded, '@', true).fromBegin, 0);
    // same input and expectation as the previous case, kept as in the original suite
    EXPECT_EQ(locate(ascendingPadded, '@', true).fromBegin, 0);
    EXPECT_EQ(locate("AAAFAAAABCDEFGHIJKLMNOPQRSTUVWXYZ", 'F', true).fromBegin, 12);

    //
    // from the right:
    //
    EXPECT_EQ(locate(descending, 'A', false).fromEnd, 0);
    EXPECT_EQ(locate(descending, '~', false).fromBegin, 0);
    EXPECT_EQ(locate("ZYXWVUTSRQPONMLKJIHGFEDCBAAAAA", 'B', false).fromEnd, 5);
    EXPECT_EQ(locate("ZYXWVUTSRQPONMLKJIHGFEDCBAAAAAAA", 'B', false).fromEnd, 7);
    EXPECT_EQ(locate(descendingPadded, 'F', false).fromEnd, 12);
    EXPECT_EQ(locate(descendingPadded, '@', false).fromEnd, 0);
    EXPECT_EQ(locate("ZYXWVUTSRQPONMLKJIHGFEDCBAAAAFAAA", 'F', false).fromEnd, 12);
}