Пример #1
0
void ImplicitTagRawRulesDeriver::deriveRawRules(const QStringList inputs,
                                                const QStringList translationScripts,
                                                const QString output)
{
  _validateInputs(inputs, translationScripts, output);

  LOG_INFO(
    "Generating implicit tag rules raw file for inputs: " << inputs <<
    ", translation scripts: " << translationScripts << ".  Writing to output: " << output << "...");
  LOG_VARD(_sortParallelCount);
  LOG_VARD(_skipFiltering);
  LOG_VARD(_translateNamesToEnglish);

  _init();

  long eligibleFeatureCount = 0;
  long totalFeatureCount = 0;
  for (int i = 0; i < inputs.size(); i++)
  {
    boost::shared_ptr<ElementInputStream> inputStream =
      _getInputStream(inputs.at(i), translationScripts.at(i));
    while (inputStream->hasMoreElements())
    {
      ElementPtr element = inputStream->readNextElement();
      LOG_VART(element);

      totalFeatureCount++;

      assert(_elementCriterion.get());
      if (_skipFiltering || _elementCriterion->isSatisfied(element))
      {
        QStringList names = element->getTags().getNames();
        assert(!names.isEmpty());

        //old_name/former_name generally indicates that an element formerly went by the name, so
        //not really useful here.
        if (names.removeAll("old_name") > 0)
        {
          LOG_VART("Removed old name tag.");
        }
        if (names.removeAll("former_name") > 0)
        {
          LOG_VART("Removed former name tag.");
        }
        assert(!names.isEmpty());

        if (_translateNamesToEnglish)
        {
          names = ImplicitTagUtils::translateNamesToEnglish(names, element->getTags(), _translator);
        }
        LOG_VART(names);

        //get back only the tags that we'd be interested in applying to future elements implicitly
        //based on name
        const QStringList kvps = _elementCriterion->getEligibleKvps(element->getTags());
        assert(!kvps.isEmpty());
        if (kvps.isEmpty())
        {
          throw HootException("Kvps empty.");
        }

        //parse whole names and token groups
        _parseNames(names, kvps);

        eligibleFeatureCount++;

        if (eligibleFeatureCount % _statusUpdateInterval == 0)
        {
          PROGRESS_INFO(
            "Parsed " << StringUtils::formatLargeNumber(eligibleFeatureCount) <<
            " eligible features / " << StringUtils::formatLargeNumber(totalFeatureCount) <<
            " total features.");
        }
      }
    }
    _inputReader->finalizePartial();
  }
  _countFile->close();

  LOG_INFO(
    "Parsed " << StringUtils::formatLargeNumber(eligibleFeatureCount) <<
    " eligible features from " << StringUtils::formatLargeNumber(totalFeatureCount) <<
    " total features.");
  LOG_INFO(
    "Wrote " << StringUtils::formatLargeNumber(_countFileLineCtr) << " lines to count file.");

  _sortByTagOccurrence();   //sort in descending count order
  _removeDuplicatedKeyTypes();
  bool tieCountsNeededResolved = false;
  if (_duplicatedWordTagKeyCountsToValues.size() > 0)
  {
    _resolveCountTies();
    tieCountsNeededResolved = true;
  }
  LOG_INFO(
    "Extracted "  << StringUtils::formatLargeNumber(_wordKeysToCountsValues.size()) <<
    " word/tag associations.");
  LOG_INFO("Clearing word/tag associations...");
  _wordKeysToCountsValues.clear();
  if (tieCountsNeededResolved)
  {
    _sortByWord(_tieResolvedCountFile);
  }
  else
  {
    _sortByWord(_dedupedCountFile);
  }
}
Пример #2
0
// Derives an implicit tag rules database from a raw rules input and writes it to the output.
//
// When no filtering criteria are configured (minimum tag occurrences and minimum word length both
// 1, all custom rule lists empty, and schema-tag-values-only disabled), the input is written
// straight to the output. Otherwise the input is optionally thresholded by minimum tag occurrence,
// filtered, and the filtered result is written to the output.
//
// input  - the raw rules input to derive from
// output - where the derived rules are written
void ImplicitTagRulesDatabaseDeriver::deriveRulesDatabase(const QString input, const QString output)
{
  _validateInputs(input, output);

  LOG_INFO(
    "Deriving implicit tag rules for input: " << input << ".  Writing to output: " <<
    output << "...");

  LOG_VARD(_minTagOccurrencesPerWord);
  LOG_VARD(_minWordLength);
  LOG_VARD(_customRules.getWordIgnoreFile());
  LOG_VARD(_customRules.getTagIgnoreFile());
  LOG_VARD(_customRules.getCustomRuleFile());
  LOG_VARD(_useSchemaTagValuesForWordsOnly);

  _customRules.init();

  LOG_VARD(_customRules.getWordIgnoreList().size());
  LOG_VARD(_customRules.getWordIgnoreList());
  LOG_VARD(_customRules.getTagIgnoreList().size());
  LOG_VARD(_customRules.getTagIgnoreList());
  LOG_VARD(_customRules.getCustomRulesList().size());
  LOG_VARD(_customRules.getCustomRulesList());

  if (_minTagOccurrencesPerWord == 1 && _minWordLength == 1 &&
      _customRules.getWordIgnoreList().size() == 0 && _customRules.getTagIgnoreList().size() == 0 &&
      _customRules.getCustomRulesList().size() == 0 && !_useSchemaTagValuesForWordsOnly)
  {
    LOG_INFO("Skipping filtering, as no filtering criteria were specified...");
    //The occurrence threshold is necessarily 1 on this path (it is part of the condition above),
    //so count thresholding can never apply here; the previously nested ">= 2" branch was
    //unreachable and has been removed.
    LOG_INFO("Skipping count thresholding since threshold = 1...");
    _writeRules(input, output);
    return;
  }

  if (_useSchemaTagValuesForWordsOnly)
  {
    _populateSchemaTagValues();
  }

  if (_minTagOccurrencesPerWord >= 2)
  {
    //threshold first, then filter the thresholded output
    _removeKvpsBelowOccurrenceThreshold(input, _minTagOccurrencesPerWord);
    _applyFiltering(_thresholdedCountFile->fileName());
  }
  else
  {
    LOG_INFO("Skipping count thresholding since min occurrence threshold = 1...");
    _applyFiltering(input);
  }

  _writeRules(_filteredCountFile->fileName(), output);
}