/**
 * Maps every label of a chain to its integer index.
 *
 * Each label returned by chain.GetLabels() is passed, in order, to
 * outputAlphabet_->LookUpIndex_NoAdding(), and the results are collected.
 * NOTE(review): judging by its name the lookup presumably does not register
 * unknown labels in the alphabet - confirm against the alphabet class.
 *
 * @param chain chain whose labels are converted
 * @return one index per label, in the same order as chain.GetLabels()
 */
vector<int> ChainCollectionTester::getLabelIndexes(const Chain& chain)
{
    const vector<wstring>& chainLabels = chain.GetLabels();

    vector<int> indexes;
    indexes.reserve(chainLabels.size());
    for (const wstring& label : chainLabels) {
        indexes.push_back(outputAlphabet_->LookUpIndex_NoAdding(label));
    }
    return indexes;
}
/**
 * Expands a chain of N nodes into a chain of 2N - 1 nodes by inserting an
 * auxiliary node after every node except the last one.
 *
 * For an original node i (i < N - 1) the result holds:
 *   - at position 2i:     the original token, label and features;
 *   - at position 2i + 1: token  "copy_" + token,
 *                         label  "POS_" + the part of the label before the
 *                                first "@" (as returned by Tools::Split),
 *                         features identical to those of node i.
 * The last original node is copied unchanged to the last position.
 * Nodes whose label is empty get empty labels in both positions.
 *
 * @param chain chain to transform; it is only read here
 * @return the expanded chain, or a copy of @p chain when it is empty
 */
Chain FullMorphologyChainTransformer::ForwardTransform(Chain& chain) const
{
    const size_t oldLength = static_cast<size_t>(chain.GetSize());
    if (oldLength == 0) {
        return chain;
    }
    const size_t newLength = 2 * oldLength - 1;

    // Create tokens
    const vector<wstring>& oldTokens = chain.GetTokens();
    vector<wstring> tokens(newLength);
    for (size_t nodeIndex = 0; nodeIndex + 1 < oldTokens.size(); ++nodeIndex) {
        // The source is const, so these are plain copies.
        tokens[2 * nodeIndex] = oldTokens[nodeIndex];
        tokens[2 * nodeIndex + 1] = L"copy_" + oldTokens[nodeIndex];
    }
    // The last node has no auxiliary companion; it still has to be carried
    // over, exactly as is done for labels and features below.
    if (!oldTokens.empty()) {
        tokens.back() = oldTokens.back();
    }

    // Create labels
    const vector<wstring>& oldLabels = chain.GetLabels();
    vector<wstring> labels(newLength);
    for (size_t nodeIndex = 0; nodeIndex + 1 < oldLabels.size(); ++nodeIndex) {
        const wstring& fullLabel = oldLabels[nodeIndex];
        if (fullLabel.empty()) {
            continue;  // both slots keep their default empty label
        }
        labels[2 * nodeIndex] = fullLabel;
        const vector<wstring> parts = Tools::Split(fullLabel, L"@");
        // Guard the index: do not rely on Split always yielding a part.
        if (!parts.empty()) {
            labels[2 * nodeIndex + 1] = L"POS_" + parts[0];
        }
    }
    // Calling back() on an empty vector is undefined behaviour.
    if (!oldLabels.empty()) {
        labels.back() = oldLabels.back();
    }

    // Create features (bounded by the feature list itself, not by the labels)
    const vector<vector<wstring> >& oldFeatures = chain.GetFeatures();
    vector<vector<wstring> > features(newLength);
    for (size_t nodeIndex = 0; nodeIndex + 1 < oldFeatures.size(); ++nodeIndex) {
        features[2 * nodeIndex] = oldFeatures[nodeIndex];
        features[2 * nodeIndex + 1] = oldFeatures[nodeIndex];
    }
    if (!oldFeatures.empty()) {
        features.back() = oldFeatures.back();
    }

    // Return chain
    Chain transformed(
        std::move(tokens)
        , std::move(features)
        , std::move(labels)
        , vector<vector<wstring> >());
    return transformed;
}
void ChainCollectionTester::Initialize(const string& testSetFile) { encodedData_.clear(); decodedData_.clear(); cout << "\nChainCollectionTester: starting reading file..." << std::endl; size_t enumerator = 0; wifstream in(testSetFile); Tools::SetLocale(in); Chain chain; // Iterate over file while(in >> chain) { if (chain.GetSize() == 0) { break; } // Add decoded data DecodedCRFData decodedDataToAdd; decodedDataToAdd.labels = chain.GetLabels(); decodedDataToAdd.tokens = chain.GetTokens(); decodedData_.push_back(decodedDataToAdd); // Add encoded data EncodedCRFData encodedDataToAdd; encodedDataToAdd.intLabels = getLabelIndexes(chain); chain = chainTransformer_->ForwardTransform(chain); possibleStateFinder_->FindPossibleStates(&chain); encodedDataToAdd.intFeatures = getFeatureIndexes(chain); encodedDataToAdd.possibleStates = getPossibleStates( chain); encodedData_.push_back(encodedDataToAdd); // Print status of data downloading ++enumerator; std::wcout << "\rHave evaluated " << enumerator << " chains..."; if (enumerator > MAX_NUMBER_IN_TEST_SET) { break; } } in.close(); printf("\rHave evaluated %lu chains...", decodedData_.size()); printf("\nChainCollectionLearner: end reading file...\n"); }