void ImplicitTagRawRulesDeriver::deriveRawRules(const QStringList inputs, const QStringList translationScripts, const QString output) { _validateInputs(inputs, translationScripts, output); LOG_INFO( "Generating implicit tag rules raw file for inputs: " << inputs << ", translation scripts: " << translationScripts << ". Writing to output: " << output << "..."); LOG_VARD(_sortParallelCount); LOG_VARD(_skipFiltering); LOG_VARD(_translateNamesToEnglish); _init(); long eligibleFeatureCount = 0; long totalFeatureCount = 0; for (int i = 0; i < inputs.size(); i++) { boost::shared_ptr<ElementInputStream> inputStream = _getInputStream(inputs.at(i), translationScripts.at(i)); while (inputStream->hasMoreElements()) { ElementPtr element = inputStream->readNextElement(); LOG_VART(element); totalFeatureCount++; assert(_elementCriterion.get()); if (_skipFiltering || _elementCriterion->isSatisfied(element)) { QStringList names = element->getTags().getNames(); assert(!names.isEmpty()); //old_name/former_name generally indicates that an element formerly went by the name, so //not really useful here. if (names.removeAll("old_name") > 0) { LOG_VART("Removed old name tag."); } if (names.removeAll("former_name") > 0) { LOG_VART("Removed former name tag."); } assert(!names.isEmpty()); if (_translateNamesToEnglish) { names = ImplicitTagUtils::translateNamesToEnglish(names, element->getTags(), _translator); } LOG_VART(names); //get back only the tags that we'd be interested in applying to future elements implicitly //based on name const QStringList kvps = _elementCriterion->getEligibleKvps(element->getTags()); assert(!kvps.isEmpty()); if (kvps.isEmpty()) { throw HootException("Kvps empty."); } //parse whole names and token groups _parseNames(names, kvps); eligibleFeatureCount++; if (eligibleFeatureCount % _statusUpdateInterval == 0) { PROGRESS_INFO( "Parsed " << StringUtils::formatLargeNumber(eligibleFeatureCount) << " eligible features / " << StringUtils::formatLargeNumber(totalFeatureCount) << " total features."); } } } _inputReader->finalizePartial(); } _countFile->close(); LOG_INFO( "Parsed " << StringUtils::formatLargeNumber(eligibleFeatureCount) << " eligible features from " << StringUtils::formatLargeNumber(totalFeatureCount) << " total features."); LOG_INFO( "Wrote " << StringUtils::formatLargeNumber(_countFileLineCtr) << " lines to count file."); _sortByTagOccurrence(); //sort in descending count order _removeDuplicatedKeyTypes(); bool tieCountsNeededResolved = false; if (_duplicatedWordTagKeyCountsToValues.size() > 0) { _resolveCountTies(); tieCountsNeededResolved = true; } LOG_INFO( "Extracted " << StringUtils::formatLargeNumber(_wordKeysToCountsValues.size()) << " word/tag associations."); LOG_INFO("Clearing word/tag associations..."); _wordKeysToCountsValues.clear(); if (tieCountsNeededResolved) { _sortByWord(_tieResolvedCountFile); } else { _sortByWord(_dedupedCountFile); } }
void ImplicitTagRulesDatabaseDeriver::deriveRulesDatabase(const QString input, const QString output) { _validateInputs(input, output); LOG_INFO( "Deriving implicit tag rules for input: " << input << ". Writing to output: " << output << "..."); LOG_VARD(_minTagOccurrencesPerWord); LOG_VARD(_minWordLength); LOG_VARD(_customRules.getWordIgnoreFile()); LOG_VARD(_customRules.getTagIgnoreFile()); LOG_VARD(_customRules.getCustomRuleFile()); LOG_VARD(_useSchemaTagValuesForWordsOnly); _customRules.init(); LOG_VARD(_customRules.getWordIgnoreList().size()); LOG_VARD(_customRules.getWordIgnoreList()); LOG_VARD(_customRules.getTagIgnoreList().size()); LOG_VARD(_customRules.getTagIgnoreList()); LOG_VARD(_customRules.getCustomRulesList().size()); LOG_VARD(_customRules.getCustomRulesList()); if (_minTagOccurrencesPerWord == 1 && _minWordLength == 1 && _customRules.getWordIgnoreList().size() == 0 && _customRules.getTagIgnoreList().size() == 0 && _customRules.getCustomRulesList().size() == 0 && !_useSchemaTagValuesForWordsOnly) { LOG_INFO("Skipping filtering, as no filtering criteria were specified..."); if (_minTagOccurrencesPerWord >= 2) { _removeKvpsBelowOccurrenceThreshold(input, _minTagOccurrencesPerWord); _writeRules(_thresholdedCountFile->fileName(), output); } else { LOG_INFO("Skipping count thresholding since threshold = 1..."); _writeRules(input, output); } } else { if (_useSchemaTagValuesForWordsOnly) { _populateSchemaTagValues(); } if (_minTagOccurrencesPerWord >= 2) { _removeKvpsBelowOccurrenceThreshold(input, _minTagOccurrencesPerWord); _applyFiltering(_thresholdedCountFile->fileName()); } else { LOG_INFO("Skipping count thresholding since min occurrence threshold = 1..."); _applyFiltering(input); } _writeRules(_filteredCountFile->fileName(), output); } }