/**
 * Locates the start position of every record in a tag path code sequence.
 *
 * The function runs two stages:
 *
 *  1. Diagnostic stage. The first difference of the trimmed sequence is
 *     computed, each difference being multiplied by the value that precedes
 *     it. Consecutive differences with the same sign are merged into a run,
 *     and the position where a run ends is stored in a map keyed by the
 *     accumulated run value, so the map is walked from the lowest key upwards.
 *     The codes found at those positions are counted and logged to stderr.
 *     The tag chosen here is NOT used for the result: stage 2 resets it.
 *
 *  2. Selection stage. Every code of the sequence is counted; codes that are
 *     not positive or that cover less than 1% of the sequence are discarded,
 *     and the remaining ones are scored as count / code.
 *
 * @param s   tag path code sequence, one code per element.
 * @param st  coverage threshold: stage 1 stops consuming transition points
 *            once the accumulated gap divided by s.size() exceeds it.
 * @return    every index i of s for which s[i] equals the selected root tag
 *            (the root tag stays 0 when no code qualifies). Empty when s is
 *            empty.
 */
vector<unsigned int> tTPSFilter::locateRecords(wstring s, float st) {
    vector<unsigned int> recpos;

    // Nothing to locate in an empty sequence. This also keeps s.size()-1
    // below from wrapping around to a huge unsigned value.
    if (s.empty())
        return recpos;

    const size_t kNoInterval = 0xffffffff; // sentinel: no interval measured yet

    vector<int> d(s.size() - 1);
    map<int, vector<int> > diffMap;
    map<int, int> TPMap;
    int rootTag = 0;
    int tagCount = 0;
    int gap = 0;
    size_t interval = kNoInterval;

    /* ---- stage 1: transition points of the weighted first difference ---- */

    auto z = s;
    auto off = trimSequence(z); // added to every position stored in diffMap
    cerr << off << " diff: " << endl;

    // Weighted first difference: (current - previous) * previous.
    for (size_t i = 1; i < z.size(); i++) {
        d[i-1] = (z[i] - z[i-1]) * z[i-1];
    }

    for (size_t i = 1; i < d.size(); i++) {
        if (sign(d[i-1]) == sign(sign(d[i]))) {
            // Same sign as the previous difference: extend the current run.
            if (sign(d[i]) > 0)
                d[i] += d[i-1];
            else
                d[i] -= d[i-1];
        } else if (d[i] >= 0) {
            // The run ended and the next difference is non-negative: store
            // the position under the accumulated value of the finished run.
            diffMap[d[i-1]].push_back(i + off);
        }
        cerr << (d[i-1] < 0 ? d[i-1] : 0) << " ";
    }
    cerr << endl;

    // Walk the stored points from the lowest key upwards until the estimated
    // gap between points covers more than `st` of the sequence. When diffMap
    // is empty (always the case for sequences of one or two elements) the
    // loop simply does not run.
    for (const auto &entry : diffMap) {
        const vector<int> &positions = entry.second;

        for (size_t j = 0; j < positions.size(); j++) {
            // NOTE(review): positions hold i + off; this assumes they are
            // valid indices into s - confirm against trimSequence().
            TPMap[s[positions[j]]]++;
            cerr << "TPS[" << positions[j] << "] = " << s[positions[j]] << endl;

            // NOTE(review): with j > 1 the distance between the first two
            // positions is never measured; j > 0 may have been intended.
            if (j > 1) {
                size_t itv = abs(positions[j] - positions[j-1]);
                if (itv < interval)
                    interval = itv;
            }
        }

        if (interval != kNoInterval)
            gap += interval * positions.size();
        if ((static_cast<float>(gap) / static_cast<float>(s.size())) > st)
            break;
    }

    // Most frequent tag path code among the points consumed above.
    for (const auto &entry : TPMap) {
        cerr << entry.first << " " << entry.second << endl;
        if (entry.second > tagCount) {
            tagCount = entry.second;
            rootTag = entry.first;
        }
    }

    /* ---- stage 2 (experimental in the original): score every code ---- */

    TPMap.clear();
    for (size_t i = 0; i < s.size(); i++) {
        TPMap[s[i]]++;
    }

    // The stage 1 choice is discarded here. tagCount is intentionally left
    // untouched so the FINAL log line below still shows the stage 1 count
    // when no code qualifies.
    rootTag = 0;
    float bestScore = 0;

    for (const auto &entry : TPMap) {
        const int tag = entry.first;
        const int count = entry.second;

        if (tag <= 0)
            continue;
        if (count < (s.size() * 0.01))
            continue;

        const float score = static_cast<float>(count) / static_cast<float>(tag);
        cerr << "*** SCORE " << tag << " / " << count << " = " << score << endl;

        // NOTE(review): the map is walked in ascending key order, so once
        // rootTag holds a positive code, `rootTag > tag` can no longer be
        // true. In practice the first qualifying code always wins - confirm
        // that this is the intended tie-break.
        if (score >= bestScore) {
            if ((rootTag > tag) || (!rootTag)) {
                tagCount = count;
                rootTag = tag;
                bestScore = score;
            }
        }
    }
    cerr << "*** FINAL " << rootTag << " / " << tagCount << " = " << bestScore << endl;

    /* ---- result: every occurrence of the root tag starts a record ---- */

    for (size_t i = 0; i < s.size(); i++) {
        if (s[i] == rootTag) {
            recpos.push_back(static_cast<unsigned int>(i));
        }
    }

    return recpos;
}
// Checks the iterator returned by trimSequence(sequence, symbol, fromLeft)
// for both trimming directions. Left-trim results are expressed as an offset
// from begin(); right-trim results as a distance back from end().
TEST(TrimSequenceTest, trimSequenceTest) {
    // Offset from begin() of the iterator returned when trimming from the left.
    const auto leftOffset = [](std::string input, char symbol) {
        const std::string::iterator cut = trimSequence(input, symbol, true);
        return cut - input.begin();
    };

    // Distance from the returned iterator to end() when trimming from the right.
    const auto rightDistance = [](std::string input, char symbol) {
        const std::string::iterator cut = trimSequence(input, symbol, false);
        return input.end() - cut;
    };

    // Digit sequences, trimming from the left.
    EXPECT_EQ(leftOffset("445566", '5'), 2);
    EXPECT_EQ(leftOffset("445554555", '5'), 6);
    EXPECT_EQ(leftOffset("4455545556", '5'), 6);
    EXPECT_EQ(leftOffset("44555455566", '5'), 6);

    // Digit sequences, trimming from the right.
    EXPECT_EQ(rightDistance("665544", '5'), 2);
    EXPECT_EQ(rightDistance("555455544", '5'), 6);
    EXPECT_EQ(rightDistance("6555455544", '5'), 6);

    // Paul's test cases in TrimSequence.cpp
    //
    // from the left:
    //
    EXPECT_EQ(leftOffset("ABCDEFGHIJKLMNOPQRSTUVWXYZ", 'A'), 0);
    // 26 is the length of the input, i.e. the result equals end().
    EXPECT_EQ(leftOffset("ABCDEFGHIJKLMNOPQRSTUVWXYZ", '~'), 26);
    EXPECT_EQ(leftOffset("AAAAABCDEFGHIJKLMNOPQRSTUVWXYZ", 'B'), 5);
    EXPECT_EQ(leftOffset("AAAAAAAABCDEFGHIJKLMNOPQRSTUVWXYZ", 'B'), 8);
    EXPECT_EQ(leftOffset("AAAAAAAABCDEFGHIJKLMNOPQRSTUVWXYZ", 'F'), 12);
    EXPECT_EQ(leftOffset("AAAAAAAABCDEFGHIJKLMNOPQRSTUVWXYZ", '@'), 0);
    // Same input and symbol as the previous check; the original repeats it.
    EXPECT_EQ(leftOffset("AAAAAAAABCDEFGHIJKLMNOPQRSTUVWXYZ", '@'), 0);
    EXPECT_EQ(leftOffset("AAAFAAAABCDEFGHIJKLMNOPQRSTUVWXYZ", 'F'), 12);

    //
    // from the right:
    //
    EXPECT_EQ(rightDistance("ZYXWVUTSRQPONMLKJIHGFEDCBA", 'A'), 0);
    // 26 is the length of the input, i.e. the result equals begin().
    EXPECT_EQ(rightDistance("ZYXWVUTSRQPONMLKJIHGFEDCBA", '~'), 26);
    EXPECT_EQ(rightDistance("ZYXWVUTSRQPONMLKJIHGFEDCBAAAAA", 'B'), 5);
    EXPECT_EQ(rightDistance("ZYXWVUTSRQPONMLKJIHGFEDCBAAAAAAA", 'B'), 7);
    EXPECT_EQ(rightDistance("ZYXWVUTSRQPONMLKJIHGFEDCBAAAAAAAA", 'F'), 12);
    EXPECT_EQ(rightDistance("ZYXWVUTSRQPONMLKJIHGFEDCBAAAAAAAA", '@'), 0);
    EXPECT_EQ(rightDistance("ZYXWVUTSRQPONMLKJIHGFEDCBAAAAFAAA", 'F'), 12);
}